updated patch to bring in the first functions with "transparent API"

pull/1824/head
Vadim Pisarevsky 12 years ago
parent bb4bf7a1f9
commit d914f20a4c
  1. 3
      cmake/OpenCVModule.cmake
  2. 29
      cmake/cl2cpp.cmake
  3. 1
      modules/bioinspired/src/precomp.hpp
  4. 2
      modules/bioinspired/src/retina_ocl.cpp
  5. 4
      modules/core/include/opencv2/core.hpp
  6. 14
      modules/core/include/opencv2/core/mat.hpp
  7. 6
      modules/core/include/opencv2/core/mat.inl.hpp
  8. 133
      modules/core/include/opencv2/core/ocl.hpp
  9. 60
      modules/core/include/opencv2/core/ocl_genbase.hpp
  10. 459
      modules/core/src/arithm.cpp
  11. 15
      modules/core/src/copy.cpp
  12. 253
      modules/core/src/matrix.cpp
  13. 300
      modules/core/src/ocl.cpp
  14. 307
      modules/core/src/opencl/arithm.cl
  15. 74
      modules/core/src/opencl/copyset.cl
  16. 96
      modules/core/src/opencl/mulspectrums.cl
  17. 73
      modules/core/src/opencl/polarcart.cl
  18. 104
      modules/core/src/opencl/reductions.cl
  19. 30
      modules/core/src/precomp.hpp
  20. 50
      modules/core/src/umatrix.cpp
  21. 73
      modules/core/test/test_umat.cpp
  22. 5
      modules/highgui/include/opencv2/highgui.hpp
  23. 10
      modules/highgui/src/cap.cpp
  24. 126
      modules/imgproc/src/color.cpp
  25. 54
      modules/imgproc/src/imgwarp.cpp
  26. 145
      modules/imgproc/src/opencl/bilateral.cl
  27. 478
      modules/imgproc/src/opencl/boxfilter.cl
  28. 636
      modules/imgproc/src/opencl/canny.cl
  29. 255
      modules/imgproc/src/opencl/clahe.cl
  30. 109
      modules/imgproc/src/opencl/convolve.cl
  31. 134
      modules/imgproc/src/opencl/copymakeborder.cl
  32. 306
      modules/imgproc/src/opencl/cvtcolor.cl
  33. 275
      modules/imgproc/src/opencl/gftt.cl
  34. 202
      modules/imgproc/src/opencl/harris.cl
  35. 279
      modules/imgproc/src/opencl/histogram.cl
  36. 280
      modules/imgproc/src/opencl/hough.cl
  37. 493
      modules/imgproc/src/opencl/integral.cl
  38. 412
      modules/imgproc/src/opencl/integral_sum.cl
  39. 381
      modules/imgproc/src/opencl/laplacian.cl
  40. 857
      modules/imgproc/src/opencl/match_template.cl
  41. 486
      modules/imgproc/src/opencl/median.cl
  42. 207
      modules/imgproc/src/opencl/mineigenval.cl
  43. 980
      modules/imgproc/src/opencl/moments.cl
  44. 228
      modules/imgproc/src/opencl/morph.cl
  45. 1010
      modules/imgproc/src/opencl/pyramid.cl
  46. 323
      modules/imgproc/src/opencl/remap.cl
  47. 152
      modules/imgproc/src/opencl/resize.cl
  48. 152
      modules/imgproc/src/opencl/threshold.cl
  49. 761
      modules/imgproc/src/opencl/warpaffine.cl
  50. 688
      modules/imgproc/src/opencl/warpperspective.cl
  51. 2
      modules/imgproc/src/precomp.hpp
  52. 81
      modules/imgproc/test/test_imgproc_umat.cpp
  53. 2
      modules/nonfree/src/precomp.hpp
  54. 2
      modules/nonfree/src/surf.ocl.cpp
  55. 6
      modules/objdetect/include/opencv2/objdetect.hpp
  56. 9
      modules/objdetect/src/cascadedetect.cpp
  57. 423
      modules/objdetect/src/opencl/haarobjectdetect.cl
  58. 306
      modules/objdetect/src/opencl/haarobjectdetect_scaled2.cl
  59. 1
      modules/objdetect/src/precomp.hpp
  60. 8
      modules/ocl/include/opencv2/ocl/private/util.hpp
  61. 2
      modules/superres/src/btv_l1_ocl.cpp
  62. 1
      modules/superres/src/precomp.hpp
  63. 276
      samples/cpp/ufacedetect.cpp
  64. 32
      samples/ocl/facedetect.cpp

@ -501,9 +501,10 @@ macro(ocv_glob_module_sources)
file(GLOB cl_kernels "src/opencl/*.cl")
if(HAVE_opencv_ocl AND cl_kernels)
ocv_include_directories(${OPENCL_INCLUDE_DIRS})
string(REGEX REPLACE "opencv_" "" the_module_barename "${the_module}")
add_custom_command(
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp"
COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake"
COMMAND ${CMAKE_COMMAND} -DMODULE_NAME="${the_module_barename}" -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake"
DEPENDS ${cl_kernels} "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake")
source_group("OpenCL" FILES ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp")
list(APPEND lib_srcs ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp")

@ -4,6 +4,15 @@ list(SORT cl_list)
string(REPLACE ".cpp" ".hpp" OUTPUT_HPP "${OUTPUT}")
get_filename_component(OUTPUT_HPP_NAME "${OUTPUT_HPP}" NAME)
# Choose the namespace wrapper for the generated kernel table.
# Start from "no wrapper": this is what the "ocl" module gets, so its entries
# are emitted directly inside the enclosing cv::ocl namespace.
set(nested_namespace_start "")
set(nested_namespace_end "")

if(NOT "${MODULE_NAME}" STREQUAL "ocl")
  # Every other module gets a nested namespace named after the module, and
  # new_mode is switched on so the per-kernel loop further down also emits
  # the ProgramSource2 "<kernel>_oclsrc" objects.
  set(new_mode ON)
  set(nested_namespace_start "namespace ${MODULE_NAME}\n{")
  set(nested_namespace_end "}")
endif()
set(STR_CPP "// This file is auto-generated. Do not edit!
#include \"precomp.hpp\"
@ -13,16 +22,19 @@ namespace cv
{
namespace ocl
{
${nested_namespace_start}
")
set(STR_HPP "// This file is auto-generated. Do not edit!
#include \"opencv2/ocl/private/util.hpp\"
#include \"opencv2/core/ocl_genbase.hpp\"
namespace cv
{
namespace ocl
{
${nested_namespace_start}
")
@ -49,12 +61,19 @@ foreach(cl ${cl_list})
string(MD5 hash "${lines}")
set(STR_CPP "${STR_CPP}const struct ProgramEntry ${cl_filename}={\"${cl_filename}\",\n\"${lines}, \"${hash}\"};\n")
set(STR_HPP "${STR_HPP}extern const struct ProgramEntry ${cl_filename};\n")
set(STR_CPP_DECL "const struct ProgramEntry ${cl_filename}={\"${cl_filename}\",\n\"${lines}, \"${hash}\"};\n")
set(STR_HPP_DECL "extern const struct ProgramEntry ${cl_filename};\n")
if(new_mode)
set(STR_CPP_DECL "${STR_CPP_DECL}ProgramSource2 ${cl_filename}_oclsrc(${cl_filename}.programStr);\n")
set(STR_HPP_DECL "${STR_HPP_DECL}extern ProgramSource2 ${cl_filename}_oclsrc;\n")
endif()
set(STR_CPP "${STR_CPP}${STR_CPP_DECL}")
set(STR_HPP "${STR_HPP}${STR_HPP_DECL}")
endforeach()
set(STR_CPP "${STR_CPP}}\n}\n")
set(STR_HPP "${STR_HPP}}\n}\n")
set(STR_CPP "${STR_CPP}}\n${nested_namespace_end}}\n")
set(STR_HPP "${STR_HPP}}\n${nested_namespace_end}}\n")
file(WRITE "${OUTPUT}" "${STR_CPP}")

@ -47,6 +47,7 @@
#include "opencv2/bioinspired.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/core/private.hpp"
#include "opencv2/core/ocl.hpp"
#include <valarray>

@ -56,6 +56,8 @@
namespace cv
{
static ocl::ProgramEntry retina_kernel = ocl::bioinspired::retina_kernel;
namespace bioinspired
{
namespace ocl

@ -347,6 +347,10 @@ CV_EXPORTS_W void max(InputArray src1, InputArray src2, OutputArray dst);
CV_EXPORTS void min(const Mat& src1, const Mat& src2, Mat& dst);
//! computes per-element maximum of two arrays (dst = max(src1, src2))
CV_EXPORTS void max(const Mat& src1, const Mat& src2, Mat& dst);
//! computes per-element minimum of two arrays (dst = min(src1, src2))
CV_EXPORTS void min(const UMat& src1, const UMat& src2, UMat& dst);
//! computes per-element maximum of two arrays (dst = max(src1, src2))
CV_EXPORTS void max(const UMat& src1, const UMat& src2, UMat& dst);
//! computes square root of each matrix element (dst = src**0.5)
CV_EXPORTS_W void sqrt(InputArray src, OutputArray dst);

@ -58,6 +58,8 @@ namespace cv
enum { ACCESS_READ=1<<24, ACCESS_WRITE=1<<25,
ACCESS_RW=3<<24, ACCESS_MASK=ACCESS_RW, ACCESS_FAST=1<<26 };
class CV_EXPORTS _OutputArray;
//////////////////////// Input/Output Array Arguments /////////////////////////////////
/*!
@ -116,12 +118,22 @@ public:
void* getObj() const;
virtual int kind() const;
virtual int dims(int i=-1) const;
virtual Size size(int i=-1) const;
virtual int sizend(int* sz, int i=-1) const;
virtual bool sameSize(const _InputArray& arr) const;
virtual size_t total(int i=-1) const;
virtual int type(int i=-1) const;
virtual int depth(int i=-1) const;
virtual int channels(int i=-1) const;
virtual bool isContinuous(int i=-1) const;
virtual bool empty() const;
virtual void copyTo(const _OutputArray& arr) const;
bool isMat() const;
bool isUMat() const;
bool isMatVectot() const;
bool isUMatVector() const;
bool isMatx();
virtual ~_InputArray();
@ -197,8 +209,10 @@ public:
virtual void create(Size sz, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
virtual void create(int rows, int cols, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
virtual void create(int dims, const int* size, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
virtual void createSameSize(const _InputArray& arr, int mtype) const;
virtual void release() const;
virtual void clear() const;
virtual void setTo(const _InputArray& value) const;
};

@ -108,6 +108,12 @@ inline _InputArray::_InputArray(const cuda::CudaMem& cuda_mem)
inline _InputArray::~_InputArray() {}
// Kind predicates: each one reports whether kind() equals a single
// _InputArray kind constant.

// True when the wrapped object is of kind MAT.
inline bool _InputArray::isMat() const
{
    return kind() == _InputArray::MAT;
}

// True when the wrapped object is of kind UMAT.
inline bool _InputArray::isUMat() const
{
    return kind() == _InputArray::UMAT;
}

// True when the wrapped object is of kind STD_VECTOR_MAT.
// NOTE(review): "isMatVectot" looks like a misspelling of "isMatVector". The
// spelling matches the in-class declaration, so a rename has to touch both.
inline bool _InputArray::isMatVectot() const
{
    return kind() == _InputArray::STD_VECTOR_MAT;
}

// True when the wrapped object is of kind STD_VECTOR_UMAT.
inline bool _InputArray::isUMatVector() const
{
    return kind() == _InputArray::STD_VECTOR_UMAT;
}

// True when the wrapped object is of kind MATX.
// NOTE(review): unlike its four siblings this member is not const-qualified,
// so it cannot be called on a const _InputArray. Presumably an oversight;
// adding const requires the same change on the declaration.
inline bool _InputArray::isMatx()
{
    return kind() == _InputArray::MATX;
}
////////////////////////////////////////////////////////////////////////////////////////
inline _OutputArray::_OutputArray() { init(ACCESS_WRITE, 0); }

@ -49,13 +49,13 @@ namespace cv { namespace ocl {
CV_EXPORTS bool haveOpenCL();
CV_EXPORTS bool useOpenCL();
CV_EXPORTS void setUseOpenCL(bool flag);
CV_EXPORTS void finish();
CV_EXPORTS void finish2();
class CV_EXPORTS Context;
class CV_EXPORTS Context2;
class CV_EXPORTS Device;
class CV_EXPORTS Kernel;
class CV_EXPORTS Program;
class CV_EXPORTS ProgramSource;
class CV_EXPORTS ProgramSource2;
class CV_EXPORTS Queue;
class CV_EXPORTS Device
@ -199,22 +199,22 @@ protected:
};
class CV_EXPORTS Context
class CV_EXPORTS Context2
{
public:
Context();
explicit Context(int dtype);
~Context();
Context(const Context& c);
Context& operator = (const Context& c);
Context2();
explicit Context2(int dtype);
~Context2();
Context2(const Context2& c);
Context2& operator = (const Context2& c);
bool create(int dtype);
size_t ndevices() const;
const Device& device(size_t idx) const;
Program getProg(const ProgramSource& prog,
Program getProg(const ProgramSource2& prog,
const String& buildopt, String& errmsg);
static Context& getDefault();
static Context2& getDefault();
void* ptr() const;
protected:
struct Impl;
@ -226,12 +226,12 @@ class CV_EXPORTS Queue
{
public:
Queue();
explicit Queue(const Context& c, const Device& d=Device());
explicit Queue(const Context2& c, const Device& d=Device());
~Queue();
Queue(const Queue& q);
Queue& operator = (const Queue& q);
bool create(const Context& c=Context(), const Device& d=Device());
bool create(const Context2& c=Context2(), const Device& d=Device());
void finish();
void* ptr() const;
static Queue& getDefault();
@ -245,41 +245,55 @@ protected:
class CV_EXPORTS KernelArg
{
public:
enum { LOCAL=1, READ_ONLY=2, WRITE_ONLY=4, READ_WRITE=6, CONSTANT=8 };
KernelArg(int _flags, UMat* _m, void* _obj=0, size_t _sz=0);
enum { LOCAL=1, READ_ONLY=2, WRITE_ONLY=4, READ_WRITE=6, CONSTANT=8, NO_SIZE=256 };
KernelArg(int _flags, UMat* _m, int wscale=1, const void* _obj=0, size_t _sz=0);
KernelArg();
static KernelArg Local() { return KernelArg(LOCAL, 0); }
static KernelArg ReadOnly(const UMat& m) { return KernelArg(READ_ONLY, (UMat*)&m); }
static KernelArg WriteOnly(const UMat& m) { return KernelArg(WRITE_ONLY, (UMat*)&m); }
static KernelArg ReadWrite(const UMat& m, int wscale=1)
{ return KernelArg(READ_WRITE, (UMat*)&m, wscale); }
static KernelArg ReadWriteNoSize(const UMat& m, int wscale=1)
{ return KernelArg(READ_WRITE+NO_SIZE, (UMat*)&m, wscale); }
static KernelArg ReadOnly(const UMat& m, int wscale=1)
{ return KernelArg(READ_ONLY, (UMat*)&m, wscale); }
static KernelArg WriteOnly(const UMat& m, int wscale=1)
{ return KernelArg(WRITE_ONLY, (UMat*)&m, wscale); }
static KernelArg ReadOnlyNoSize(const UMat& m, int wscale=1)
{ return KernelArg(READ_ONLY+NO_SIZE, (UMat*)&m, wscale); }
static KernelArg WriteOnlyNoSize(const UMat& m, int wscale=1)
{ return KernelArg(WRITE_ONLY+NO_SIZE, (UMat*)&m, wscale); }
static KernelArg Constant(const Mat& m);
template<typename _Tp> static KernelArg Constant(const _Tp* arr, size_t n)
{ return KernelArg(CONSTANT, 0, (void*)arr, n); }
{ return KernelArg(CONSTANT, 0, 1, (void*)arr, n); }
int flags;
UMat* m;
void* obj;
const void* obj;
size_t sz;
int wscale;
};
class CV_EXPORTS Kernel
{
public:
Kernel();
Kernel(const char* kname, const Program& prog);
Kernel(const char* kname, const ProgramSource& prog,
const String& buildopts, String& errmsg);
Kernel(const char* kname, const ProgramSource2& prog,
const String& buildopts, String* errmsg=0);
~Kernel();
Kernel(const Kernel& k);
Kernel& operator = (const Kernel& k);
bool empty() const;
bool create(const char* kname, const Program& prog);
bool create(const char* kname, const ProgramSource& prog,
const String& buildopts, String& errmsg);
bool create(const char* kname, const ProgramSource2& prog,
const String& buildopts, String* errmsg=0);
void set(int i, const void* value, size_t sz);
void set(int i, const UMat& m);
void set(int i, const KernelArg& arg);
template<typename _Tp> void set(int i, const _Tp& value)
int set(int i, const void* value, size_t sz);
int set(int i, const UMat& m);
int set(int i, const KernelArg& arg);
template<typename _Tp> int set(int i, const _Tp& value)
{ return set(i, &value, sizeof(value)); }
template<typename _Tp0>
@ -291,26 +305,27 @@ public:
template<typename _Tp0, typename _Tp1>
Kernel& args(const _Tp0& a0, const _Tp1& a1)
{
set(0, a0); set(1, a1); return *this;
int i = set(0, a0); set(i, a1); return *this;
}
template<typename _Tp0, typename _Tp1, typename _Tp2>
Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2)
{
set(0, a0); set(1, a1); set(2, a2); return *this;
int i = set(0, a0); i = set(i, a1); set(i, a2); return *this;
}
template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3>
Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3)
{
set(0, a0); set(1, a1); set(2, a2); set(3, a3); return *this;
int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3); return *this;
}
template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4>
Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2,
const _Tp3& a3, const _Tp4& a4)
{
set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); return *this;
int i = set(0, a0); i = set(i, a1); i = set(i, a2);
i = set(i, a3); set(i, a4); return *this;
}
template<typename _Tp0, typename _Tp1, typename _Tp2,
@ -318,8 +333,8 @@ public:
Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2,
const _Tp3& a3, const _Tp4& a4, const _Tp5& a5)
{
set(0, a0); set(1, a1); set(2, a2);
set(3, a3); set(4, a4); set(5, a5); return *this;
int i = set(0, a0); i = set(i, a1); i = set(i, a2);
i = set(i, a3); i = set(i, a4); set(i, a5); return *this;
}
template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3,
@ -327,8 +342,8 @@ public:
Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3,
const _Tp4& a4, const _Tp5& a5, const _Tp6& a6)
{
set(0, a0); set(1, a1); set(2, a2); set(3, a3);
set(4, a4); set(5, a5); set(6, a6); return *this;
int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3);
i = set(i, a4); i = set(i, a5); set(i, a6); return *this;
}
template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3,
@ -336,8 +351,8 @@ public:
Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3,
const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7)
{
set(0, a0); set(1, a1); set(2, a2); set(3, a3);
set(4, a4); set(5, a5); set(6, a6); set(7, a7); return *this;
int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3);
i = set(i, a4); i = set(i, a5); i = set(i, a6); set(i, a7); return *this;
}
template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4,
@ -346,8 +361,8 @@ public:
const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7,
const _Tp8& a8)
{
set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4);
set(5, a5); set(6, a6); set(7, a7); set(8, a8); return *this;
int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4);
i = set(i, a5); i = set(i, a6); i = set(i, a7); set(i, a8); return *this;
}
template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4,
@ -356,8 +371,8 @@ public:
const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7,
const _Tp8& a8, const _Tp9& a9)
{
set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); set(5, a5);
set(6, a6); set(7, a7); set(8, a8); set(9, a9); return *this;
int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4); i = set(i, a5);
i = set(i, a6); i = set(i, a7); i = set(i, a8); set(i, a9); return *this;
}
template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3,
@ -367,8 +382,8 @@ public:
const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7,
const _Tp8& a8, const _Tp9& a9, const _Tp10& a10)
{
set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); set(5, a5);
set(6, a6); set(7, a7); set(8, a8); set(9, a9); set(10, a10); return *this;
int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4); i = set(i, a5);
i = set(i, a6); i = set(i, a7); i = set(i, a8); i = set(i, a9); set(i, a10); return *this;
}
template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3,
@ -378,13 +393,13 @@ public:
const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7,
const _Tp8& a8, const _Tp9& a9, const _Tp10& a10, const _Tp11& a11)
{
set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); set(5, a5);
set(6, a6); set(7, a7); set(8, a8); set(9, a9); set(10, a10); set(11, a11); return *this;
int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4); i = set(i, a5);
i = set(i, a6); i = set(i, a7); i = set(i, a8); i = set(i, a9); i = set(i, a10); set(i, a11); return *this;
}
void run(int dims, size_t offset[], size_t globalsize[],
bool run(int dims, size_t globalsize[],
size_t localsize[], bool sync, const Queue& q=Queue());
void runTask(bool sync, const Queue& q=Queue());
bool runTask(bool sync, const Queue& q=Queue());
size_t workGroupSize() const;
bool compileWorkGroupSize(size_t wsz[]) const;
@ -401,7 +416,7 @@ class CV_EXPORTS Program
{
public:
Program();
Program(const ProgramSource& src,
Program(const ProgramSource2& src,
const String& buildflags, String& errmsg);
explicit Program(const String& buf);
Program(const Program& prog);
@ -409,12 +424,12 @@ public:
Program& operator = (const Program& prog);
~Program();
bool create(const ProgramSource& src,
bool create(const ProgramSource2& src,
const String& buildflags, String& errmsg);
bool read(const String& buf, const String& buildflags);
bool write(String& buf) const;
const ProgramSource& source() const;
const ProgramSource2& source() const;
void* ptr() const;
String getPrefix() const;
@ -426,17 +441,17 @@ protected:
};
class CV_EXPORTS ProgramSource
class CV_EXPORTS ProgramSource2
{
public:
typedef uint64 hash_t;
ProgramSource();
explicit ProgramSource(const String& prog);
explicit ProgramSource(const char* prog);
~ProgramSource();
ProgramSource(const ProgramSource& prog);
ProgramSource& operator = (const ProgramSource& prog);
ProgramSource2();
explicit ProgramSource2(const String& prog);
explicit ProgramSource2(const char* prog);
~ProgramSource2();
ProgramSource2(const ProgramSource2& prog);
ProgramSource2& operator = (const ProgramSource2& prog);
const String& source() const;
hash_t hash() const;
@ -446,6 +461,10 @@ protected:
Impl* p;
};
CV_EXPORTS const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf);
CV_EXPORTS const char* typeToStr(int t);
CV_EXPORTS const char* memopTypeToStr(int t);
}}
#endif

@ -0,0 +1,60 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the OpenCV Foundation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_OPENCL_GENBASE_HPP__
#define __OPENCV_OPENCL_GENBASE_HPP__
namespace cv
{
namespace ocl
{
// Describes one OpenCL program embedded into the library by cmake/cl2cpp.cmake.
// The generated opencl_kernels.cpp brace-initializes each entry positionally as
// { name, programStr, programHash }, so the member order must not be changed.
struct ProgramEntry
{
// Identifier the generator uses for the kernel file (its ${cl_filename}).
const char* name;
// Kernel source text as processed by the generator, NUL-terminated.
const char* programStr;
// MD5 digest of that source text, computed by the generator with string(MD5 ...).
const char* programHash;
};
}
}
#endif

@ -911,33 +911,112 @@ void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t bl
scbuf[i] = scbuf[i - esz];
}
static void binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
InputArray _mask, const BinaryFunc* tab, bool bitwise)
// Operation codes for the OpenCL arithmetic path. Each value is also used as
// an index into oclop2str below, so both lists must keep the same order.
enum
{
    OCL_OP_ADD = 0,
    OCL_OP_SUB = 1,
    OCL_OP_RSUB = 2,
    OCL_OP_ABSDIFF = 3,
    OCL_OP_MUL = 4,
    OCL_OP_MUL_SCALE = 5,
    OCL_OP_DIV_SCALE = 6,
    OCL_OP_RECIP_SCALE = 7,
    OCL_OP_ADDW = 8,
    OCL_OP_AND = 9,
    OCL_OP_OR = 10,
    OCL_OP_XOR = 11,
    OCL_OP_NOT = 12,
    OCL_OP_MIN = 13,
    OCL_OP_MAX = 14
};

// Symbol passed to the kernel build as "-D <symbol>" for each operation code.
// The table is terminated by a null pointer.
static const char* oclop2str[] =
{
    "OP_ADD",          // OCL_OP_ADD
    "OP_SUB",          // OCL_OP_SUB
    "OP_RSUB",         // OCL_OP_RSUB
    "OP_ABSDIFF",      // OCL_OP_ABSDIFF
    "OP_MUL",          // OCL_OP_MUL
    "OP_MUL_SCALE",    // OCL_OP_MUL_SCALE
    "OP_DIV_SCALE",    // OCL_OP_DIV_SCALE
    "OP_RECIP_SCALE",  // OCL_OP_RECIP_SCALE
    "OP_ADDW",         // OCL_OP_ADDW
    "OP_AND",          // OCL_OP_AND
    "OP_OR",           // OCL_OP_OR
    "OP_XOR",          // OCL_OP_XOR
    "OP_NOT",          // OCL_OP_NOT
    "OP_MIN",          // OCL_OP_MIN
    "OP_MAX",          // OCL_OP_MAX
    0
};
// Tries to execute an element-wise "array op array" or "array op scalar"
// operation with the OpenCL kernel "KF" from the core arithm program.
//
//   _src1      first operand (always taken as an array)
//   _src2      second operand: an array, or the scalar when haveScalar is true
//   _dst       destination; it is only fetched here via getUMat(), so it is
//              expected to have been created by the caller already
//   _mask      optional mask; an empty mask selects the unmasked kernel variant
//   bitwise    selects memopTypeToStr() instead of typeToStr() for the dstT macro
//   oclop      one of the OCL_OP_* codes, used as an index into oclop2str
//   haveScalar true for the scalar ("UNARY_OP") kernel variant
//
// Returns false when the operation is rejected up front or the kernel object
// is empty after construction; otherwise returns the result of Kernel::run().
// binary_op() continues with its CPU implementation when this returns false.
static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
InputArray _mask, bool bitwise, int oclop, bool haveScalar )
{
// NOTE(review): kind1/kind2 are never read in this function, and src1/src2 are
// declared again as UMat a few lines below. These two declarations look like
// leftovers of the pre-patch binary_op() body - confirm and drop.
int kind1 = _src1.kind(), kind2 = _src2.kind();
Mat src1 = _src1.getMat(), src2 = _src2.getMat();
bool haveMask = !_mask.empty();
int srctype = _src1.type();
int srcdepth = CV_MAT_DEPTH(srctype);
int cn = CV_MAT_CN(srctype);
// Reject negative operation codes, and masked/scalar operations on more than
// 4 channels.
if( oclop < 0 || ((haveMask || haveScalar) && cn > 4) )
return false;
UMat src1 = _src1.getUMat(), src2;
UMat dst = _dst.getUMat(), mask = _mask.getUMat();
char opts[1024];
// With a mask or a scalar the kernel works on whole cn-channel elements;
// otherwise the data is treated as single-channel and the width is scaled
// by cn instead (see cscale and globalsize below).
int kercn = haveMask || haveScalar ? cn : 1;
// Build options: "-D [MASK_]UNARY_OP|BINARY_OP -D <op symbol> -D dstT=<type>".
sprintf(opts, "-D %s%s -D %s -D dstT=%s",
(haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"), oclop2str[oclop],
bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, kercn)) :
ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)));
ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
if( k.empty() )
return false;
// Width scale handed to the KernelArg wrappers: cn when kercn == 1, else 1.
int cscale = cn/kercn;
ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cscale);
// The masked variant passes dst as read-write - presumably so that elements
// outside the mask keep their previous value (confirm against arithm.cl).
ocl::KernelArg dstarg = haveMask ? ocl::KernelArg::ReadWrite(dst, cscale) :
ocl::KernelArg::WriteOnly(dst, cscale);
ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1);
if( haveScalar )
{
size_t esz = CV_ELEM_SIZE(srctype);
double buf[4] = {0,0,0,0};
// OCL_OP_NOT has no second operand, so the scalar buffer stays zeroed.
if( oclop != OCL_OP_NOT )
{
Mat src2sc = _src2.getMat();
convertAndUnrollScalar(src2sc, srctype, (uchar*)buf, 1);
}
// Raw argument: esz bytes of buf are passed by value.
// NOTE(review): the third argument is wscale and is 0 here, while
// KernelArg::Constant() passes 1 - confirm that 0 is intended.
ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, buf, esz);
// Kernel argument order: src1, [mask], dst, scalar.
if( !haveMask )
k.args(src1arg, dstarg, scalararg);
else
k.args(src1arg, maskarg, dstarg, scalararg);
}
else
{
src2 = _src2.getUMat();
ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cscale);
// Kernel argument order: src1, src2, [mask], dst.
if( !haveMask )
k.args(src1arg, src2arg, dstarg);
else
k.args(src1arg, src2arg, maskarg, dstarg);
}
// 2-D launch: cols*(cn/kercn) work items per row, one row of work items per
// matrix row. No local size is given and the call is not synchronous.
size_t globalsize[] = { src1.cols*(cn/kercn), src1.rows };
return k.run(2, globalsize, 0, false);
}
static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst,
InputArray _mask, const BinaryFunc* tab,
bool bitwise, int oclop )
{
const _InputArray *psrc1 = &_src1, *psrc2 = &_src2;
int kind1 = psrc1->kind(), kind2 = psrc2->kind();
int type1 = psrc1->type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2);
int dims1 = psrc1->dims(), dims2 = psrc2->dims();
Size sz1 = dims1 <= 2 ? psrc1->size() : Size();
Size sz2 = dims2 <= 2 ? psrc2->size() : Size();
bool use_opencl = (kind1 == _InputArray::UMAT || kind2 == _InputArray::UMAT) &&
ocl::useOpenCL() && dims1 <= 2 && dims2 <= 2;
bool haveMask = !_mask.empty(), haveScalar = false;
BinaryFunc func;
int c;
if( src1.dims <= 2 && src2.dims <= 2 && kind1 == kind2 &&
src1.size() == src2.size() && src1.type() == src2.type() && !haveMask )
if( dims1 <= 2 && dims2 <= 2 && kind1 == kind2 && sz1 == sz2 && type1 == type2 && !haveMask )
{
_dst.create(src1.size(), src1.type());
Mat dst = _dst.getMat();
_dst.create(sz1, type1);
if( use_opencl && ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, false) )
return;
if( bitwise )
{
func = *tab;
c = (int)src1.elemSize();
cn = (int)CV_ELEM_SIZE(type1);
}
else
{
func = tab[src1.depth()];
c = src1.channels();
}
func = tab[depth1];
Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat();
Size sz = getContinuousSize(src1, src2, dst);
size_t len = sz.width*(size_t)c;
size_t len = sz.width*(size_t)cn;
if( len == (size_t)(int)len )
{
sz.width = (int)len;
@ -946,56 +1025,67 @@ static void binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
}
}
if( (kind1 == _InputArray::MATX) + (kind2 == _InputArray::MATX) == 1 ||
src1.size != src2.size || src1.type() != src2.type() )
if( oclop == OCL_OP_NOT )
haveScalar = true;
else if( (kind1 == _InputArray::MATX) + (kind2 == _InputArray::MATX) == 1 ||
!psrc1->sameSize(*psrc2) || type1 != type2 )
{
if( checkScalar(src1, src2.type(), kind1, kind2) )
if( checkScalar(*psrc1, type2, kind1, kind2) )
{
// src1 is a scalar; swap it with src2
swap(src1, src2);
else if( !checkScalar(src2, src1.type(), kind2, kind1) )
swap(psrc1, psrc2);
swap(type1, type2);
swap(depth1, depth2);
swap(cn, cn2);
swap(sz1, sz2);
}
else if( !checkScalar(*psrc2, type1, kind2, kind1) )
CV_Error( CV_StsUnmatchedSizes,
"The operation is neither 'array op array' (where arrays have the same size and type), "
"nor 'array op scalar', nor 'scalar op array'" );
haveScalar = true;
}
else
{
CV_Assert( psrc1->sameSize(*psrc2) && type1 == type2 );
}
size_t esz = src1.elemSize();
size_t esz = CV_ELEM_SIZE(type1);
size_t blocksize0 = (BLOCK_SIZE + esz-1)/esz;
int cn = src1.channels();
BinaryFunc copymask = 0;
Mat mask;
bool reallocate = false;
if( haveMask )
{
mask = _mask.getMat();
CV_Assert( (mask.type() == CV_8UC1 || mask.type() == CV_8SC1) );
CV_Assert( mask.size == src1.size );
int mtype = _mask.type();
CV_Assert( (mtype == CV_8U || mtype == CV_8S) && _mask.sameSize(*psrc1));
copymask = getCopyMaskFunc(esz);
Mat tdst = _dst.getMat();
reallocate = tdst.size != src1.size || tdst.type() != src1.type();
reallocate = !_dst.sameSize(*psrc1) || _dst.type() != type1;
}
AutoBuffer<uchar> _buf;
uchar *scbuf = 0, *maskbuf = 0;
_dst.create(src1.dims, src1.size, src1.type());
Mat dst = _dst.getMat();
_dst.createSameSize(*psrc1, type1);
// if this is mask operation and dst has been reallocated,
// we have to
// we have to clear the destination
if( haveMask && reallocate )
dst = Scalar::all(0);
_dst.setTo(0.);
if( use_opencl && ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, haveScalar ))
return;
Mat src1 = psrc1->getMat(), src2 = psrc2->getMat();
Mat dst = _dst.getMat(), mask = _mask.getMat();
if( bitwise )
{
func = *tab;
c = (int)esz;
cn = (int)esz;
}
else
{
func = tab[src1.depth()];
c = cn;
func = tab[depth1];
}
if( !haveScalar )
@ -1006,8 +1096,8 @@ static void binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
NAryMatIterator it(arrays, ptrs);
size_t total = it.size, blocksize = total;
if( blocksize*c > INT_MAX )
blocksize = INT_MAX/c;
if( blocksize*cn > INT_MAX )
blocksize = INT_MAX/cn;
if( haveMask )
{
@ -1022,7 +1112,7 @@ static void binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
{
int bsz = (int)MIN(total - j, blocksize);
func( ptrs[0], 0, ptrs[1], 0, haveMask ? maskbuf : ptrs[2], 0, Size(bsz*c, 1), 0 );
func( ptrs[0], 0, ptrs[1], 0, haveMask ? maskbuf : ptrs[2], 0, Size(bsz*cn, 1), 0 );
if( haveMask )
{
copymask( maskbuf, 0, ptrs[3], 0, ptrs[2], 0, Size(bsz, 1), &esz );
@ -1054,7 +1144,7 @@ static void binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
{
int bsz = (int)MIN(total - j, blocksize);
func( ptrs[0], 0, scbuf, 0, haveMask ? maskbuf : ptrs[1], 0, Size(bsz*c, 1), 0 );
func( ptrs[0], 0, scbuf, 0, haveMask ? maskbuf : ptrs[1], 0, Size(bsz*cn, 1), 0 );
if( haveMask )
{
copymask( maskbuf, 0, ptrs[2], 0, ptrs[1], 0, Size(bsz, 1), &esz );
@ -1101,47 +1191,59 @@ static BinaryFunc* getMinTab()
void cv::bitwise_and(InputArray a, InputArray b, OutputArray c, InputArray mask)
{
BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(and8u);
binary_op(a, b, c, mask, &f, true);
binary_op(a, b, c, mask, &f, true, OCL_OP_AND);
}
void cv::bitwise_or(InputArray a, InputArray b, OutputArray c, InputArray mask)
{
BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(or8u);
binary_op(a, b, c, mask, &f, true);
binary_op(a, b, c, mask, &f, true, OCL_OP_OR);
}
void cv::bitwise_xor(InputArray a, InputArray b, OutputArray c, InputArray mask)
{
BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(xor8u);
binary_op(a, b, c, mask, &f, true);
binary_op(a, b, c, mask, &f, true, OCL_OP_XOR);
}
void cv::bitwise_not(InputArray a, OutputArray c, InputArray mask)
{
BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(not8u);
binary_op(a, a, c, mask, &f, true);
binary_op(a, a, c, mask, &f, true, OCL_OP_NOT);
}
void cv::max( InputArray src1, InputArray src2, OutputArray dst )
{
binary_op(src1, src2, dst, noArray(), getMaxTab(), false );
binary_op(src1, src2, dst, noArray(), getMaxTab(), false, OCL_OP_MAX );
}
void cv::min( InputArray src1, InputArray src2, OutputArray dst )
{
binary_op(src1, src2, dst, noArray(), getMinTab(), false );
binary_op(src1, src2, dst, noArray(), getMinTab(), false, OCL_OP_MIN );
}
void cv::max(const Mat& src1, const Mat& src2, Mat& dst)
{
OutputArray _dst(dst);
binary_op(src1, src2, _dst, noArray(), getMaxTab(), false );
binary_op(src1, src2, _dst, noArray(), getMaxTab(), false, OCL_OP_MAX );
}
void cv::min(const Mat& src1, const Mat& src2, Mat& dst)
{
OutputArray _dst(dst);
binary_op(src1, src2, _dst, noArray(), getMinTab(), false );
binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN );
}
void cv::max(const UMat& src1, const UMat& src2, UMat& dst)
{
OutputArray _dst(dst);
binary_op(src1, src2, _dst, noArray(), getMaxTab(), false, OCL_OP_MAX );
}
void cv::min(const UMat& src1, const UMat& src2, UMat& dst)
{
OutputArray _dst(dst);
binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN );
}
@ -1171,73 +1273,213 @@ static int actualScalarDepth(const double* data, int len)
CV_32S;
}
static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
InputArray _mask, int dtype, BinaryFunc* tab, bool muldiv=false, void* usrdata=0)
static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
InputArray _mask, int wtype,
void* usrdata, int oclop,
bool haveScalar )
{
int kind1 = _src1.kind(), kind2 = _src2.kind();
Mat src1 = _src1.getMat(), src2 = _src2.getMat();
int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
bool haveMask = !_mask.empty();
bool reallocate = false;
bool src1Scalar = checkScalar(src1, src2.type(), kind1, kind2);
bool src2Scalar = checkScalar(src2, src1.type(), kind2, kind1);
if( (haveMask || haveScalar) && cn > 4 )
return false;
if( (kind1 == kind2 || src1.channels() == 1) && src1.dims <= 2 && src2.dims <= 2 &&
src1.size() == src2.size() && src1.type() == src2.type() &&
!haveMask && ((!_dst.fixedType() && (dtype < 0 || CV_MAT_DEPTH(dtype) == src1.depth())) ||
(_dst.fixedType() && _dst.type() == _src1.type())) &&
int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), wdepth = CV_MAT_DEPTH(wtype);
wtype = CV_MAKETYPE(wdepth, cn);
int type2 = haveScalar ? _src2.type() : wtype, depth2 = CV_MAT_DEPTH(type2);
UMat src1 = _src1.getUMat(), src2;
UMat dst = _dst.getUMat(), mask = _mask.getUMat();
char opts[1024];
int kercn = haveMask || haveScalar ? cn : 1;
if( (depth1 == depth2 || haveScalar) && ddepth == depth1 && wdepth == depth1 )
{
const char* oclopstr = oclop2str[oclop];
if( wdepth <= CV_16S )
{
oclopstr = oclop == OCL_OP_ADD ? "OCL_OP_ADD_SAT" :
oclop == OCL_OP_SUB ? "OCL_OP_SUB_SAT" :
oclop == OCL_OP_RSUB ? "OCL_OP_RSUB_SAT" : oclopstr;
}
sprintf(opts, "-D %s%s -D %s -D dstT=%s",
(haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"),
oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(ddepth, kercn)));
}
else
{
char cvtstr[3][32];
sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT2=%s "
"-D dstT=%s -D workT=%s -D convertToWT1=%s "
"-D convertToWT2=%s -D convertToDT=%s",
(haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"),
oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)),
ocl::typeToStr(CV_MAKETYPE(depth2, kercn)),
ocl::typeToStr(CV_MAKETYPE(ddepth, kercn)),
ocl::typeToStr(CV_MAKETYPE(wdepth, kercn)),
ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]),
ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]),
ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]));
}
const uchar* usrdata_p = (const uchar*)usrdata;
const double* usrdata_d = (const double*)usrdata;
float usrdata_f[3];
int i, n = oclop == OCL_OP_MUL_SCALE || oclop == OCL_OP_DIV_SCALE ||
oclop == OCL_OP_RECIP_SCALE ? 1 : oclop == OCL_OP_ADDW ? 3 : 0;
if( n > 0 && wdepth == CV_32F )
{
for( i = 0; i < n; i++ )
usrdata_f[i] = (float)usrdata_d[i];
usrdata_p = (const uchar*)usrdata_f;
}
size_t usrdata_esz = CV_ELEM_SIZE(wdepth);
ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
if( k.empty() )
return false;
int cscale = cn/kercn;
ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cscale);
ocl::KernelArg dstarg = haveMask ? ocl::KernelArg::ReadWrite(dst, cscale) :
ocl::KernelArg::WriteOnly(dst, cscale);
ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1);
if( haveScalar )
{
size_t esz = CV_ELEM_SIZE(wtype);
double buf[4]={0,0,0,0};
Mat src2sc = _src2.getMat();
if( !src2sc.empty() )
{
convertAndUnrollScalar(src2sc, wtype, (uchar*)buf, 1);
}
ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, buf, esz);
if( !haveMask )
k.args(src1arg, dstarg, scalararg);
else
k.args(src1arg, maskarg, dstarg, scalararg);
}
else
{
src2 = _src2.getUMat();
ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cscale);
if( !haveMask )
{
if(n == 0)
k.args(src1arg, src2arg, dstarg);
else if(n == 1)
k.args(src1arg, src2arg, dstarg,
ocl::KernelArg(0, 0, 0, usrdata_p, usrdata_esz));
else if(n == 3)
k.args(src1arg, src2arg, dstarg,
ocl::KernelArg(0, 0, 0, usrdata_p, usrdata_esz),
ocl::KernelArg(0, 0, 0, usrdata_p + usrdata_esz, usrdata_esz),
ocl::KernelArg(0, 0, 0, usrdata_p + usrdata_esz*2, usrdata_esz));
else
CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters");
}
else
{
k.args(src1arg, src2arg, maskarg, dstarg);
}
}
size_t globalsize[] = { src1.cols*(cn/kercn), src1.rows };
return k.run(2, globalsize, 0, false);
}
static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
InputArray _mask, int dtype, BinaryFunc* tab, bool muldiv=false,
void* usrdata=0, int oclop=-1 )
{
const _InputArray *psrc1 = &_src1, *psrc2 = &_src2;
int kind1 = psrc1->kind(), kind2 = psrc2->kind();
bool haveMask = !_mask.empty();
bool reallocate = false;
int type1 = psrc1->type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2);
int wtype, dims1 = psrc1->dims(), dims2 = psrc2->dims();
Size sz1 = dims1 <= 2 ? psrc1->size() : Size();
Size sz2 = dims2 <= 2 ? psrc2->size() : Size();
bool use_opencl = (kind1 == _InputArray::UMAT || kind2 == _InputArray::UMAT) &&
ocl::useOpenCL() && dims1 <= 2 && dims2 <= 2;
bool src1Scalar = checkScalar(*psrc1, type2, kind1, kind2);
bool src2Scalar = checkScalar(*psrc2, type1, kind2, kind1);
if( (kind1 == kind2 || cn == 1) && sz1 == sz2 && dims1 <= 2 && dims2 <= 2 && type1 == type2 &&
!haveMask && ((!_dst.fixedType() && (dtype < 0 || CV_MAT_DEPTH(dtype) == depth1)) ||
(_dst.fixedType() && _dst.type() == type1)) &&
((src1Scalar && src2Scalar) || (!src1Scalar && !src2Scalar)) )
{
_dst.create(src1.size(), src1.type());
Mat dst = _dst.getMat();
_dst.createSameSize(*psrc1, type1);
if( use_opencl &&
ocl_arithm_op(*psrc1, *psrc2, _dst, _mask,
(!usrdata ? type1 : std::max(depth1, CV_32F)),
usrdata, oclop, false))
return;
Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat();
Size sz = getContinuousSize(src1, src2, dst, src1.channels());
tab[src1.depth()](src1.data, src1.step, src2.data, src2.step, dst.data, dst.step, sz, usrdata);
tab[depth1](src1.data, src1.step, src2.data, src2.step, dst.data, dst.step, sz, usrdata);
return;
}
bool haveScalar = false, swapped12 = false;
int depth2 = src2.depth();
if( src1.size != src2.size || src1.channels() != src2.channels() ||
if( dims1 != dims2 || sz1 != sz2 || cn != cn2 ||
((kind1 == _InputArray::MATX || kind2 == _InputArray::MATX) &&
src1.cols == 1 && src2.rows == 4) )
(sz1 == Size(1,4) || sz2 == Size(1,4))) )
{
if( checkScalar(src1, src2.type(), kind1, kind2) )
if( checkScalar(*psrc1, type2, kind1, kind2) )
{
// src1 is a scalar; swap it with src2
swap(src1, src2);
swap(psrc1, psrc2);
swap(sz1, sz2);
swap(type1, type2);
swap(depth1, depth2);
swap(cn, cn2);
swap(dims1, dims2);
swapped12 = true;
if( oclop == OCL_OP_SUB )
oclop = OCL_OP_RSUB;
}
else if( !checkScalar(src2, src1.type(), kind2, kind1) )
else if( !checkScalar(*psrc2, type1, kind2, kind1) )
CV_Error( CV_StsUnmatchedSizes,
"The operation is neither 'array op array' (where arrays have the same size and the same number of channels), "
"The operation is neither 'array op array' "
"(where arrays have the same size and the same number of channels), "
"nor 'array op scalar', nor 'scalar op array'" );
haveScalar = true;
CV_Assert(src2.type() == CV_64F && (src2.rows == 4 || src2.rows == 1));
CV_Assert(type2 == CV_64F && (sz2.height == 1 || sz2.height == 4));
if (!muldiv)
{
depth2 = actualScalarDepth(src2.ptr<double>(), src1.channels());
if( depth2 == CV_64F && (src1.depth() < CV_32S || src1.depth() == CV_32F) )
Mat sc = psrc2->getMat();
depth2 = actualScalarDepth(sc.ptr<double>(), cn);
if( depth2 == CV_64F && (depth1 < CV_32S || depth1 == CV_32F) )
depth2 = CV_32F;
}
else
depth2 = CV_64F;
}
int cn = src1.channels(), depth1 = src1.depth(), wtype;
BinaryFunc cvtsrc1 = 0, cvtsrc2 = 0, cvtdst = 0;
if( dtype < 0 )
{
if( _dst.fixedType() )
dtype = _dst.type();
else
{
if( !haveScalar && src1.type() != src2.type() )
if( !haveScalar && type1 != type2 )
CV_Error(CV_StsBadArg,
"When the input arrays in add/subtract/multiply/divide functions have different types, "
"the output array type must be explicitly specified");
dtype = src1.type();
dtype = type1;
}
}
dtype = CV_MAT_DEPTH(dtype);
@ -1262,39 +1504,41 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
wtype = std::max(wtype, dtype);
}
cvtsrc1 = depth1 == wtype ? 0 : getConvertFunc(depth1, wtype);
cvtsrc2 = depth2 == depth1 ? cvtsrc1 : depth2 == wtype ? 0 : getConvertFunc(depth2, wtype);
cvtdst = dtype == wtype ? 0 : getConvertFunc(wtype, dtype);
dtype = CV_MAKETYPE(dtype, cn);
wtype = CV_MAKETYPE(wtype, cn);
size_t esz1 = src1.elemSize(), esz2 = src2.elemSize();
size_t dsz = CV_ELEM_SIZE(dtype), wsz = CV_ELEM_SIZE(wtype);
size_t blocksize0 = (size_t)(BLOCK_SIZE + wsz-1)/wsz;
BinaryFunc copymask = 0;
Mat mask;
if( haveMask )
{
mask = _mask.getMat();
CV_Assert( (mask.type() == CV_8UC1 || mask.type() == CV_8SC1) );
CV_Assert( mask.size == src1.size );
copymask = getCopyMaskFunc(dsz);
Mat tdst = _dst.getMat();
reallocate = tdst.size != src1.size || tdst.type() != dtype;
int mtype = _mask.type();
CV_Assert( (mtype == CV_8UC1 || mtype == CV_8SC1) && _mask.sameSize(*psrc1) );
reallocate = !_dst.sameSize(*psrc1) || _dst.type() != dtype;
}
AutoBuffer<uchar> _buf;
uchar *buf, *maskbuf = 0, *buf1 = 0, *buf2 = 0, *wbuf = 0;
size_t bufesz = (cvtsrc1 ? wsz : 0) + (cvtsrc2 || haveScalar ? wsz : 0) + (cvtdst ? wsz : 0) + (haveMask ? dsz : 0);
_dst.createSameSize(*psrc1, dtype);
if( reallocate )
_dst.setTo(0.);
_dst.create(src1.dims, src1.size, dtype);
Mat dst = _dst.getMat();
if( use_opencl &&
ocl_arithm_op(*psrc1, *psrc2, _dst, _mask, wtype,
usrdata, oclop, haveScalar))
return;
if( haveMask && reallocate )
dst = Scalar::all(0);
BinaryFunc cvtsrc1 = type1 == wtype ? 0 : getConvertFunc(type1, wtype);
BinaryFunc cvtsrc2 = type2 == type1 ? cvtsrc1 : type2 == wtype ? 0 : getConvertFunc(type2, wtype);
BinaryFunc cvtdst = dtype == wtype ? 0 : getConvertFunc(wtype, dtype);
size_t esz1 = CV_ELEM_SIZE(type1), esz2 = CV_ELEM_SIZE(type2);
size_t dsz = CV_ELEM_SIZE(dtype), wsz = CV_ELEM_SIZE(wtype);
size_t blocksize0 = (size_t)(BLOCK_SIZE + wsz-1)/wsz;
BinaryFunc copymask = getCopyMaskFunc(dsz);
Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(), mask = _mask.getMat();
AutoBuffer<uchar> _buf;
uchar *buf, *maskbuf = 0, *buf1 = 0, *buf2 = 0, *wbuf = 0;
size_t bufesz = (cvtsrc1 ? wsz : 0) +
(cvtsrc2 || haveScalar ? wsz : 0) +
(cvtdst ? wsz : 0) +
(haveMask ? dsz : 0);
BinaryFunc func = tab[CV_MAT_DEPTH(wtype)];
if( !haveScalar )
@ -1476,7 +1720,7 @@ static BinaryFunc* getAbsDiffTab()
// Element-wise addition: forwards to arithm_op() with the add function table,
// muldiv = false, no extra user data, and the OCL_OP_ADD tag.
void cv::add( InputArray src1, InputArray src2, OutputArray dst,
              InputArray mask, int dtype )
{
    // One dispatch only: a second untagged call would add twice when `dst`
    // aliases one of the inputs.
    arithm_op(src1, src2, dst, mask, dtype, getAddTab(), false, 0, OCL_OP_ADD );
}
void cv::subtract( InputArray src1, InputArray src2, OutputArray dst,
@ -1511,12 +1755,12 @@ void cv::subtract( InputArray src1, InputArray src2, OutputArray dst,
}
}
#endif
arithm_op(src1, src2, dst, mask, dtype, getSubTab() );
arithm_op(src1, src2, dst, mask, dtype, getSubTab(), false, 0, OCL_OP_SUB );
}
// Element-wise absolute difference: forwards to arithm_op() with the absdiff
// function table, no mask, dtype = -1, and the OCL_OP_ABSDIFF tag.
void cv::absdiff( InputArray src1, InputArray src2, OutputArray dst )
{
    // One dispatch only; the stale untagged call has been removed.
    arithm_op(src1, src2, dst, noArray(), -1, getAbsDiffTab(), false, 0, OCL_OP_ABSDIFF);
}
/****************************************************************************************\
@ -1847,19 +2091,20 @@ static BinaryFunc* getRecipTab()
// Element-wise scaled multiplication: forwards to arithm_op() with the multiply
// function table, muldiv = true and &scale as user data. The operation tag is
// OCL_OP_MUL when scale is exactly 1 and OCL_OP_MUL_SCALE otherwise.
void cv::multiply(InputArray src1, InputArray src2,
                  OutputArray dst, double scale, int dtype)
{
    // One dispatch only; the stale untagged call has been removed.
    arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(),
              true, &scale, scale == 1. ? OCL_OP_MUL : OCL_OP_MUL_SCALE);
}
// Element-wise scaled division: forwards to arithm_op() with the divide
// function table, muldiv = true, &scale as user data, and OCL_OP_DIV_SCALE.
void cv::divide(InputArray src1, InputArray src2,
                OutputArray dst, double scale, int dtype)
{
    // One dispatch only; the stale untagged call has been removed.
    arithm_op(src1, src2, dst, noArray(), dtype, getDivTab(), true, &scale, OCL_OP_DIV_SCALE);
}
// Scaled reciprocal overload: forwards to arithm_op() with the reciprocal
// function table and OCL_OP_RECIP_SCALE. `src2` is passed in both operand
// slots; &scale is the user data.
void cv::divide(double scale, InputArray src2,
                OutputArray dst, int dtype)
{
    // One dispatch only: a second pass with `dst` aliasing `src2` would take
    // the reciprocal of the already-computed result.
    arithm_op(src2, src2, dst, noArray(), dtype, getRecipTab(), true, &scale, OCL_OP_RECIP_SCALE);
}
/****************************************************************************************\
@ -2020,7 +2265,7 @@ void cv::addWeighted( InputArray src1, double alpha, InputArray src2,
double beta, double gamma, OutputArray dst, int dtype )
{
double scalars[] = {alpha, beta, gamma};
arithm_op(src1, src2, dst, noArray(), dtype, getAddWeightedTab(), true, scalars);
arithm_op(src1, src2, dst, noArray(), dtype, getAddWeightedTab(), true, scalars, OCL_OP_ADDW);
}

@ -220,6 +220,21 @@ void Mat::copyTo( OutputArray _dst ) const
return;
}
if( _dst.isUMat() )
{
_dst.create( dims, size.p, type() );
UMat dst = _dst.getUMat();
size_t i, sz[CV_MAX_DIM], dstofs[CV_MAX_DIM], esz = elemSize();
for( i = 0; i < (size_t)dims; i++ )
sz[i] = size.p[i];
sz[dims-1] *= esz;
dst.ndoffset(dstofs);
dstofs[dims-1] *= esz;
dst.u->currAllocator->upload(dst.u, data, dims, sz, dstofs, dst.step.p, step.p);
return;
}
if( dims <= 2 )
{
_dst.create( rows, cols, type() );

@ -1436,6 +1436,181 @@ Size _InputArray::size(int i) const
}
}
int _InputArray::sizend(int* sz, int i) const
{
int j, d=0, k = kind();
if( k == NONE )
;
else if( k == MAT )
{
CV_Assert( i < 0 );
const Mat& m = *(const Mat*)obj;
d = m.dims;
if(sz)
for(j = 0; j < d; j++)
sz[j] = m.size.p[j];
}
else if( k == UMAT )
{
CV_Assert( i < 0 );
const UMat& m = *(const UMat*)obj;
d = m.dims;
if(sz)
for(j = 0; j < d; j++)
sz[j] = m.size.p[j];
}
else if( k == STD_VECTOR_MAT && i >= 0 )
{
const std::vector<Mat>& vv = *(const std::vector<Mat>*)obj;
CV_Assert( i < (int)vv.size() );
const Mat& m = vv[i];
d = m.dims;
if(sz)
for(j = 0; j < d; j++)
sz[j] = m.size.p[j];
}
else if( k == STD_VECTOR_UMAT && i >= 0 )
{
const std::vector<UMat>& vv = *(const std::vector<UMat>*)obj;
CV_Assert( i < (int)vv.size() );
const UMat& m = vv[i];
d = m.dims;
if(sz)
for(j = 0; j < d; j++)
sz[j] = m.size.p[j];
}
else
{
Size sz2d = size(i);
d = 2;
if(sz)
{
sz[0] = sz2d.height;
sz[1] = sz2d.width;
}
}
return d;
}
bool _InputArray::sameSize(const _InputArray& arr) const
{
int k1 = kind(), k2 = arr.kind();
Size sz1;
if( k1 == MAT )
{
const Mat* m = ((const Mat*)obj);
if( k2 == MAT )
return m->size == ((const Mat*)arr.obj)->size;
if( k2 == UMAT )
return m->size == ((const UMat*)arr.obj)->size;
if( m->dims > 2 )
return false;
sz1 = m->size();
}
else if( k1 == UMAT )
{
const UMat* m = ((const UMat*)obj);
if( k2 == MAT )
return m->size == ((const Mat*)arr.obj)->size;
if( k2 == UMAT )
return m->size == ((const UMat*)arr.obj)->size;
if( m->dims > 2 )
return false;
sz1 = m->size();
}
else
sz1 = size();
if( arr.dims() > 2 )
return false;
return sz1 == arr.size();
}
int _InputArray::dims(int i) const
{
int k = kind();
if( k == MAT )
{
CV_Assert( i < 0 );
return ((const Mat*)obj)->dims;
}
if( k == EXPR )
{
CV_Assert( i < 0 );
return ((const MatExpr*)obj)->a.dims;
}
if( k == UMAT )
{
CV_Assert( i < 0 );
return ((const UMat*)obj)->dims;
}
if( k == MATX )
{
CV_Assert( i < 0 );
return 2;
}
if( k == STD_VECTOR )
{
CV_Assert( i < 0 );
return 2;
}
if( k == NONE )
return 0;
if( k == STD_VECTOR_VECTOR )
{
const std::vector<std::vector<uchar> >& vv = *(const std::vector<std::vector<uchar> >*)obj;
if( i < 0 )
return 1;
CV_Assert( i < (int)vv.size() );
return 2;
}
if( k == STD_VECTOR_MAT )
{
const std::vector<Mat>& vv = *(const std::vector<Mat>*)obj;
if( i < 0 )
return 1;
CV_Assert( i < (int)vv.size() );
return vv[i].dims;
}
if( k == OPENGL_BUFFER )
{
CV_Assert( i < 0 );
return 2;
}
if( k == GPU_MAT )
{
CV_Assert( i < 0 );
return 2;
}
if( k == OCL_MAT )
{
return 2;
}
CV_Assert( k == CUDA_MEM );
//if( k == CUDA_MEM )
{
CV_Assert( i < 0 );
return 2;
}
}
size_t _InputArray::total(int i) const
{
int k = kind();
@ -1570,6 +1745,61 @@ bool _InputArray::empty() const
return ((const cuda::CudaMem*)obj)->empty();
}
bool _InputArray::isContinuous(int i) const
{
int k = kind();
if( k == MAT )
return i < 0 ? ((const Mat*)obj)->isContinuous() : true;
if( k == UMAT )
return i < 0 ? ((const UMat*)obj)->isContinuous() : true;
if( k == EXPR || k == MATX || k == STD_VECTOR || k == NONE || k == STD_VECTOR_VECTOR)
return true;
if( k == STD_VECTOR_MAT )
{
const std::vector<Mat>& vv = *(const std::vector<Mat>*)obj;
CV_Assert((size_t)i < vv.size());
return vv[i].isContinuous();
}
if( k == STD_VECTOR_UMAT )
{
const std::vector<UMat>& vv = *(const std::vector<UMat>*)obj;
CV_Assert((size_t)i < vv.size());
return vv[i].isContinuous();
}
CV_Error(CV_StsNotImplemented, "This method is not implemented for oclMat yet");
return false;
}
void _InputArray::copyTo(const _OutputArray& arr) const
{
int k = kind();
if( k == NONE )
arr.release();
else if( k == MAT || k == MATX || k == STD_VECTOR )
{
Mat m = getMat();
m.copyTo(arr);
}
else if( k == EXPR )
{
const MatExpr& e = *((MatExpr*)obj);
if( arr.kind() == MAT )
arr.getMatRef() = e;
else
Mat(e).copyTo(arr);
}
else if( k == UMAT )
((UMat*)obj)->copyTo(arr);
else
CV_Error(Error::StsNotImplemented, "");
}
bool _OutputArray::fixedSize() const
{
@ -1899,6 +2129,12 @@ void _OutputArray::create(int dims, const int* sizes, int mtype, int i,
CV_Error(Error::StsNotImplemented, "Unknown/unsupported array type");
}
void _OutputArray::createSameSize(const _InputArray& arr, int mtype) const
{
int sz[CV_MAX_DIM], d = arr.sizend(sz);
create(d, sz, mtype);
}
void _OutputArray::release() const
{
CV_Assert(!fixedSize());
@ -2010,6 +2246,23 @@ cuda::CudaMem& _OutputArray::getCudaMemRef() const
return *(cuda::CudaMem*)obj;
}
void _OutputArray::setTo(const _InputArray& arr) const
{
int k = kind();
if( k == NONE )
;
else if( k == MAT || k == MATX || k == STD_VECTOR )
{
Mat m = getMat();
m.setTo(arr);
}
else if( k == UMAT )
((UMat*)obj)->setTo(arr);
else
CV_Error(Error::StsNotImplemented, "");
}
// File-scope array object handed out by noArray(); presumably empty, since it is
// default-constructed (the constructor is not visible here - TODO confirm).
static _InputOutputArray _none;
// Returns the shared `_none` object. Callers pass it where an optional array
// argument, such as a mask, is not supplied.
InputOutputArray noArray() { return _none; }

@ -592,9 +592,16 @@ static void* initOpenCLAndLoad(const char* funcname)
{
if(!initialized)
{
handle = dlopen("/System/Library/Frameworks/OpenCL.framework/Versions/Current/OpenCL", RTLD_LAZY);
const char* oclpath = getenv("OPENCV_OPENCL_RUNTIME");
oclpath = oclpath && strlen(oclpath) > 0 ? oclpath :
"/System/Library/Frameworks/OpenCL.framework/Versions/Current/OpenCL";
handle = dlopen(oclpath, RTLD_LAZY);
initialized = true;
g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0;
if( g_haveOpenCL )
fprintf(stderr, "Succesffuly loaded OpenCL v1.1+ runtime from %s\n", oclpath);
else
fprintf(stderr, "Failed to load OpenCL runtime\n");
}
if(!handle)
return 0;
@ -1212,16 +1219,13 @@ namespace cv { namespace ocl {
struct UMat2D
{
UMat2D(const UMat& m, int accessFlags)
UMat2D(const UMat& m)
{
CV_Assert(m.dims == 2);
data = (cl_mem)m.handle(accessFlags);
offset = m.offset;
step = m.step;
rows = m.rows;
cols = m.cols;
}
cl_mem data;
size_t offset;
size_t step;
int rows;
@ -1230,10 +1234,8 @@ struct UMat2D
struct UMat3D
{
UMat3D(const UMat& m, int accessFlags)
UMat3D(const UMat& m)
{
CV_Assert(m.dims == 3);
data = (cl_mem)m.handle(accessFlags);
offset = m.offset;
step = m.step.p[1];
slicestep = m.step.p[0];
@ -1241,7 +1243,6 @@ struct UMat3D
rows = m.size.p[1];
cols = m.size.p[2];
}
cl_mem data;
size_t offset;
size_t slicestep;
size_t step;
@ -1315,7 +1316,7 @@ void setUseOpenCL(bool flag)
}
}
void finish()
void finish2()
{
Queue::getDefault().finish();
}
@ -1528,7 +1529,7 @@ String Device::OpenCLVersion() const
{ return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); }
// Returns the driver version string (CL_DRIVER_VERSION), or an empty string
// for an uninitialized Device. The stale body that queried
// CL_DEVICE_EXTENSIONS has been dropped.
String Device::driverVersion() const
{ return p ? p->getStrProp(CL_DRIVER_VERSION) : String(); }
int Device::type() const
{ return p ? p->getProp<cl_device_type, int>(CL_DEVICE_TYPE) : 0; }
@ -1705,14 +1706,14 @@ size_t Device::profilingTimerResolution() const
// Returns the device of the default context selected by the calling thread's
// TLS `device` index.
const Device& Device::getDefault()
{
    // Only the Context2 lookup is kept; the stale `Context` declaration of the
    // same variable has been removed.
    const Context2& ctx = Context2::getDefault();
    int idx = TLSData::get()->device;
    return ctx.device(idx);
}
/////////////////////////////////////////////////////////////////////////////////////////
struct Context::Impl
struct Context2::Impl
{
Impl(int dtype0)
{
@ -1777,7 +1778,7 @@ struct Context::Impl
devices.clear();
}
Program getProg(const ProgramSource& src,
Program getProg(const ProgramSource2& src,
const String& buildflags, String& errmsg)
{
String prefix = Program::getPrefix(buildflags);
@ -1787,7 +1788,8 @@ struct Context::Impl
return it->second;
//String filename = format("%08x%08x_%08x%08x.clb2",
Program prog(src, buildflags, errmsg);
phash.insert(std::pair<HashKey,Program>(k, prog));
if(prog.ptr())
phash.insert(std::pair<HashKey,Program>(k, prog));
return prog;
}
@ -1797,7 +1799,7 @@ struct Context::Impl
std::vector<Device> devices;
bool initialized;
typedef ProgramSource::hash_t hash_t;
typedef ProgramSource2::hash_t hash_t;
struct HashKey
{
@ -1812,18 +1814,18 @@ struct Context::Impl
};
Context::Context()
Context2::Context2()
{
p = 0;
}
Context::Context(int dtype)
Context2::Context2(int dtype)
{
p = 0;
create(dtype);
}
bool Context::create(int dtype0)
bool Context2::create(int dtype0)
{
if( !haveOpenCL() )
return false;
@ -1838,19 +1840,19 @@ bool Context::create(int dtype0)
return p != 0;
}
Context::~Context()
Context2::~Context2()
{
p->release();
}
Context::Context(const Context& c)
Context2::Context2(const Context2& c)
{
p = (Impl*)c.p;
if(p)
p->addref();
}
Context& Context::operator = (const Context& c)
Context2& Context2::operator = (const Context2& c)
{
Impl* newp = (Impl*)c.p;
if(newp)
@ -1861,30 +1863,30 @@ Context& Context::operator = (const Context& c)
return *this;
}
void* Context::ptr() const
void* Context2::ptr() const
{
return p->handle;
}
size_t Context::ndevices() const
size_t Context2::ndevices() const
{
return p ? p->devices.size() : 0;
}
// Returns the idx-th device of this context. For an empty context or an
// out-of-range index, a function-local static dummy Device is returned instead.
const Device& Context2::device(size_t idx) const
{
    static Device dummy;
    return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
}
Context& Context::getDefault()
Context2& Context2::getDefault()
{
static Context ctx;
static Context2 ctx;
if( !ctx.p && haveOpenCL() )
{
// do not create new Context right away.
// do not create new Context2 right away.
// First, try to retrieve existing context of the same type.
// In its turn, Platform::getContext() may call Context::create()
// In its turn, Platform::getContext() may call Context2::create()
// if there is no such context.
ctx.create(Device::TYPE_ACCELERATOR);
if(!ctx.p)
@ -1898,7 +1900,7 @@ Context& Context::getDefault()
return ctx;
}
Program Context::getProg(const ProgramSource& prog,
Program Context2::getProg(const ProgramSource2& prog,
const String& buildopts, String& errmsg)
{
return p ? p->getProg(prog, buildopts, errmsg) : Program();
@ -1906,14 +1908,14 @@ Program Context::getProg(const ProgramSource& prog,
struct Queue::Impl
{
Impl(const Context& c, const Device& d)
Impl(const Context2& c, const Device& d)
{
refcount = 1;
const Context* pc = &c;
const Context2* pc = &c;
cl_context ch = (cl_context)pc->ptr();
if( !ch )
{
pc = &Context::getDefault();
pc = &Context2::getDefault();
ch = (cl_context)pc->ptr();
}
cl_device_id dh = (cl_device_id)d.ptr();
@ -1943,7 +1945,7 @@ Queue::Queue()
p = 0;
}
Queue::Queue(const Context& c, const Device& d)
Queue::Queue(const Context2& c, const Device& d)
{
p = 0;
create(c, d);
@ -1973,7 +1975,7 @@ Queue::~Queue()
p->release();
}
bool Queue::create(const Context& c, const Device& d)
bool Queue::create(const Context2& c, const Device& d)
{
if(p)
p->release();
@ -1996,7 +1998,7 @@ Queue& Queue::getDefault()
{
Queue& q = TLSData::get()->oclQueue;
if( !q.p )
q.create(Context::getDefault());
q.create(Context2::getDefault());
return q;
}
@ -2008,15 +2010,20 @@ static cl_command_queue getQueue(const Queue& q)
return qq;
}
// Default argument descriptor: no flags, no UMat, no raw data, width scale 1.
KernelArg::KernelArg()
    : flags(0), m(0), obj(0), sz(0), wscale(1)
{
}

// Full argument descriptor.
//   _flags  - KernelArg flag bits (access mode, NO_SIZE, CONSTANT, ...)
//   _m      - UMat to bind, or null for a raw-data argument
//   _wscale - multiplier applied to the UMat's column count when the size is
//             passed to the kernel (see Kernel::set)
//   _obj    - pointer to raw argument data (used when _m is null)
//   _sz     - size of the raw argument data in bytes
KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, const void* _obj, size_t _sz)
    : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale)
{
}
// Wraps the raw bytes of a continuous Mat as a CONSTANT kernel argument.
// No UMat is attached, and the width scale is 1.
KernelArg KernelArg::Constant(const Mat& m)
{
    CV_Assert(m.isContinuous());
    // Uses the 5-parameter constructor (flags, UMat, wscale, data, size); the
    // stale 4-argument form no longer matches it.
    return KernelArg(CONSTANT, 0, 1, m.data, m.total()*m.elemSize());
}
@ -2099,8 +2106,8 @@ Kernel::Kernel(const char* kname, const Program& prog)
create(kname, prog);
}
Kernel::Kernel(const char* kname, const ProgramSource& src,
const String& buildopts, String& errmsg)
Kernel::Kernel(const char* kname, const ProgramSource2& src,
const String& buildopts, String* errmsg)
{
p = 0;
create(kname, src, buildopts, errmsg);
@ -2143,15 +2150,17 @@ bool Kernel::create(const char* kname, const Program& prog)
return p != 0;
}
bool Kernel::create(const char* kname, const ProgramSource& src,
const String& buildopts, String& errmsg)
bool Kernel::create(const char* kname, const ProgramSource2& src,
const String& buildopts, String* errmsg)
{
if(p)
{
p->release();
p = 0;
}
const Program& prog = Context::getDefault().getProg(src, buildopts, errmsg);
String tempmsg;
if( !errmsg ) errmsg = &tempmsg;
const Program& prog = Context2::getDefault().getProg(src, buildopts, *errmsg);
return create(kname, prog);
}
@ -2160,55 +2169,91 @@ void* Kernel::ptr() const
return p ? p->handle : 0;
}
void Kernel::set(int i, const void* value, size_t sz)
bool Kernel::empty() const
{
CV_Assert( p && clSetKernelArg(p->handle, (cl_uint)i, sz, value) >= 0 );
return ptr() == 0;
}
// Binds `sz` bytes at `value` as kernel argument number `i`.
// Returns the index of the next argument (i + 1), or -1 when the kernel is
// empty or clSetKernelArg reports an error.
// Setting argument 0 starts a new argument list, so the UMats recorded for the
// previous one are cleaned up first.
int Kernel::set(int i, const void* value, size_t sz)
{
    CV_Assert(i >= 0);
    // An empty kernel has nothing to clean up and must not be dereferenced.
    if( !p )
        return -1;
    if( i == 0 )
        p->cleanupUMats();
    if( !p->handle || clSetKernelArg(p->handle, (cl_uint)i, sz, value) < 0 )
        return -1;
    return i+1;
}
void Kernel::set(int i, const UMat& m)
int Kernel::set(int i, const UMat& m)
{
set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m, 0, 0));
return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m, 0, 0));
}
void Kernel::set(int i, const KernelArg& arg)
int Kernel::set(int i, const KernelArg& arg)
{
CV_Assert( p && p->handle );
CV_Assert( i >= 0 );
if( i == 0 )
p->cleanupUMats();
if( !p || !p->handle )
return -1;
if( arg.m )
{
int accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : 0) +
((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : 0);
cl_mem h = (cl_mem)arg.m->handle(accessFlags);
if( arg.m->dims <= 2 )
{
UMat2D u2d(*arg.m, accessFlags);
clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d), &u2d);
UMat2D u2d(*arg.m);
clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h);
clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step);
clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset);
i += 3;
if( !(arg.flags & KernelArg::NO_SIZE) )
{
int cols = u2d.cols*arg.wscale;
clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows);
clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.cols), &cols);
i += 2;
}
}
else
{
UMat3D u3d(*arg.m, accessFlags);
clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d), &u3d);
UMat3D u3d(*arg.m);
clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h);
clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep);
clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step);
clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset);
i += 4;
if( !(arg.flags & KernelArg::NO_SIZE) )
{
int cols = u3d.cols*arg.wscale;
clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.rows);
clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows);
clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols);
i += 3;
}
}
p->addUMat(*arg.m);
return i;
}
else
{
clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj);
}
clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj);
return i+1;
}
void Kernel::run(int dims, size_t offset[], size_t globalsize[], size_t localsize[],
bool Kernel::run(int dims, size_t globalsize[], size_t localsize[],
bool sync, const Queue& q)
{
CV_Assert(p && p->handle && p->e == 0);
if(!p || !p->handle || p->e != 0)
return false;
cl_command_queue qq = getQueue(q);
clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims,
offset, globalsize, localsize, 0, 0,
sync ? 0 : &p->e);
if( sync )
size_t offset[CV_MAX_DIM] = {0};
cl_int retval = clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims,
offset, globalsize, localsize, 0, 0,
sync ? 0 : &p->e);
if( sync || retval < 0 )
{
clFinish(qq);
p->cleanupUMats();
@ -2218,14 +2263,17 @@ void Kernel::run(int dims, size_t offset[], size_t globalsize[], size_t localsiz
p->addref();
clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p);
}
return retval >= 0;
}
void Kernel::runTask(bool sync, const Queue& q)
bool Kernel::runTask(bool sync, const Queue& q)
{
CV_Assert(p && p->handle && p->e == 0);
if(!p || !p->handle || p->e != 0)
return false;
cl_command_queue qq = getQueue(q);
clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &p->e);
if( sync )
cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &p->e);
if( sync || retval < 0 )
{
clFinish(qq);
p->cleanupUMats();
@ -2235,6 +2283,7 @@ void Kernel::runTask(bool sync, const Queue& q)
p->addref();
clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p);
}
return retval >= 0;
}
@ -2273,11 +2322,11 @@ size_t Kernel::localMemSize() const
struct Program::Impl
{
Impl(const ProgramSource& _src,
Impl(const ProgramSource2& _src,
const String& _buildflags, String& errmsg)
{
refcount = 1;
const Context& ctx = Context::getDefault();
const Context2& ctx = Context2::getDefault();
src = _src;
buildflags = _buildflags;
const String& srcstr = src.source();
@ -2293,17 +2342,20 @@ struct Program::Impl
void** deviceList = deviceListBuf;
for( i = 0; i < n; i++ )
deviceList[i] = ctx.device(i).ptr();
printf("Building the OpenCL program ...\n");
retval = clBuildProgram(handle, n,
(const cl_device_id*)deviceList,
buildflags.c_str(), 0, 0);
if( retval == CL_BUILD_PROGRAM_FAILURE )
{
char buf[1024];
char buf[1<<16];
size_t retsz = 0;
clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0], CL_PROGRAM_BUILD_LOG,
sizeof(buf)-16, buf, &retsz);
errmsg = String(buf);
CV_Error_(Error::StsAssert, ("OpenCL program can not be built: %s", errmsg.c_str()));
}
CV_Assert(retval >= 0);
}
}
@ -2315,7 +2367,7 @@ struct Program::Impl
if(_buf.empty())
return;
String prefix0 = Program::getPrefix(buildflags);
const Context& ctx = Context::getDefault();
const Context2& ctx = Context2::getDefault();
const Device& dev = Device::getDefault();
const char* pos0 = _buf.c_str();
const char* pos1 = strchr(pos0, '\n');
@ -2366,7 +2418,7 @@ struct Program::Impl
IMPLEMENT_REFCOUNTABLE();
ProgramSource src;
ProgramSource2 src;
String buildflags;
cl_program handle;
};
@ -2374,7 +2426,7 @@ struct Program::Impl
Program::Program() { p = 0; }
Program::Program(const ProgramSource& src,
Program::Program(const ProgramSource2& src,
const String& buildflags, String& errmsg)
{
p = 0;
@ -2405,7 +2457,7 @@ Program::~Program()
p->release();
}
bool Program::create(const ProgramSource& src,
bool Program::create(const ProgramSource2& src,
const String& buildflags, String& errmsg)
{
if(p)
@ -2419,9 +2471,9 @@ bool Program::create(const ProgramSource& src,
return p != 0;
}
const ProgramSource& Program::source() const
const ProgramSource2& Program::source() const
{
static ProgramSource dummy;
static ProgramSource2 dummy;
return p ? p->src : dummy;
}
@ -2455,7 +2507,7 @@ String Program::getPrefix() const
String Program::getPrefix(const String& buildflags)
{
const Context& ctx = Context::getDefault();
const Context2& ctx = Context2::getDefault();
const Device& dev = ctx.device(0);
return format("name=%s\ndriver=%s\nbuildflags=%s\n",
dev.name().c_str(), dev.driverVersion().c_str(), buildflags.c_str());
@ -2463,7 +2515,7 @@ String Program::getPrefix(const String& buildflags)
////////////////////////////////////////////////////////////////////////////////////////
struct ProgramSource::Impl
struct ProgramSource2::Impl
{
Impl(const char* _src)
{
@ -2482,39 +2534,39 @@ struct ProgramSource::Impl
IMPLEMENT_REFCOUNTABLE();
String src;
ProgramSource::hash_t h;
ProgramSource2::hash_t h;
};
ProgramSource::ProgramSource()
ProgramSource2::ProgramSource2()
{
p = 0;
}
ProgramSource::ProgramSource(const char* prog)
ProgramSource2::ProgramSource2(const char* prog)
{
p = new Impl(prog);
}
ProgramSource::ProgramSource(const String& prog)
ProgramSource2::ProgramSource2(const String& prog)
{
p = new Impl(prog);
}
ProgramSource::~ProgramSource()
ProgramSource2::~ProgramSource2()
{
if(p)
p->release();
}
ProgramSource::ProgramSource(const ProgramSource& prog)
ProgramSource2::ProgramSource2(const ProgramSource2& prog)
{
p = prog.p;
if(p)
p->addref();
}
ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
ProgramSource2& ProgramSource2::operator = (const ProgramSource2& prog)
{
Impl* newp = (Impl*)prog.p;
if(newp)
@ -2525,13 +2577,13 @@ ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
return *this;
}
const String& ProgramSource::source() const
const String& ProgramSource2::source() const
{
static String dummy;
return p ? p->src : dummy;
}
ProgramSource::hash_t ProgramSource::hash() const
ProgramSource2::hash_t ProgramSource2::hash() const
{
return p ? p->h : 0;
}
@ -2551,7 +2603,7 @@ public:
return u;
}
void getBestFlags(const Context& ctx, int& createFlags, int& flags0) const
void getBestFlags(const Context2& ctx, int& createFlags, int& flags0) const
{
const Device& dev = ctx.device(0);
createFlags = CL_MEM_READ_WRITE;
@ -2574,7 +2626,7 @@ public:
total *= sizes[i];
}
Context& ctx = Context::getDefault();
Context2& ctx = Context2::getDefault();
int createFlags = 0, flags0 = 0;
getBestFlags(ctx, createFlags, flags0);
@ -2603,7 +2655,7 @@ public:
if(u->handle == 0)
{
CV_Assert(u->origdata != 0);
Context& ctx = Context::getDefault();
Context2& ctx = Context2::getDefault();
int createFlags = 0, flags0 = 0;
getBestFlags(ctx, createFlags, flags0);
@ -2848,7 +2900,6 @@ public:
new_srcofs, new_dstofs, new_sz, new_srcstep[0], new_srcstep[1],
new_dststep[0], new_dststep[1], dstptr, 0, 0, 0) >= 0 );
}
clFinish(q);
}
void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
@ -2890,6 +2941,9 @@ public:
if( iscontinuous )
{
int crc = 0;
for( size_t i = 0; i < total; i++ )
crc ^= ((uchar*)srcptr)[i];
CV_Assert( clEnqueueWriteBuffer(q, (cl_mem)u->handle,
CL_TRUE, dstrawofs, total, srcptr, 0, 0, 0) >= 0 );
}
@ -2949,10 +3003,11 @@ public:
}
else
{
CV_Assert( clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
cl_int retval;
CV_Assert( (retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
new_srcofs, new_dstofs, new_sz,
new_srcstep[0], new_srcstep[1], new_dststep[0], new_dststep[1],
0, 0, 0) >= 0 );
0, 0, 0)) >= 0 );
}
dst->markHostCopyObsolete(true);
@ -2969,4 +3024,61 @@ MatAllocator* getOpenCLAllocator()
return &allocator;
}
// Returns the name of the OpenCL scalar/vector type that corresponds to the
// OpenCV matrix type code `t` (depth + channel count), e.g. "short2".
// The lookup table has one row per depth (8 rows) and one column per channel
// count 1..4; the last row is the "?" placeholder for the unused depth slot.
// For more than 4 channels there is no table column and "?" is returned.
const char* typeToStr(int t)
{
    static const char* tab[]=
    {
        "uchar", "uchar2", "uchar3", "uchar4",
        "char", "char2", "char3", "char4",
        "ushort", "ushort2", "ushort3", "ushort4",
        "short", "short2", "short3", "short4",
        "int", "int2", "int3", "int4",
        "float", "float2", "float3", "float4",
        "double", "double2", "double3", "double4",
        "?", "?", "?", "?"
    };
    int cn = CV_MAT_CN(t);
    // cn == 4 is a valid column of the table; only cn > 4 has no entry.
    return cn > 4 ? "?" : tab[CV_MAT_DEPTH(t)*4 + cn-1];
}
// Returns the name of an OpenCL integer type that has the same size as one
// element of the OpenCV type `t`. Per the table below, signed depths map to
// the unsigned type of the same width, the float row maps to int and the
// double row maps to long, so the result is only suitable for bit-exact
// memory operations (copy/fill), not for arithmetic.
// For more than 4 channels there is no table column and "?" is returned.
const char* memopTypeToStr(int t)
{
    static const char* tab[]=
    {
        "uchar", "uchar2", "uchar3", "uchar4",
        "uchar", "uchar2", "uchar3", "uchar4",
        "ushort", "ushort2", "ushort3", "ushort4",
        "ushort", "ushort2", "ushort3", "ushort4",
        "int", "int2", "int3", "int4",
        "int", "int2", "int3", "int4",
        "long", "long2", "long3", "long4",
        "?", "?", "?", "?"
    };
    int cn = CV_MAT_CN(t);
    // cn == 4 is a valid column of the table; only cn > 4 has no entry.
    return cn > 4 ? "?" : tab[CV_MAT_DEPTH(t)*4 + cn-1];
}
const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
{
if( sdepth == ddepth )
return "noconvert";
const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn));
if( ddepth >= CV_32F ||
(ddepth == CV_32S && sdepth < CV_32S) ||
(ddepth == CV_16S && sdepth <= CV_8S) ||
(ddepth == CV_16U && sdepth == CV_8U))
{
sprintf(buf, "convert_%s", typestr);
}
else if( sdepth >= CV_32F )
{
sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : ""));
}
else
{
sprintf(buf, "convert_%s_sat", typestr);
}
return buf;
}
}}

@ -0,0 +1,307 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the copyright holders or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/*
Usage:
after compiling this program user gets a single kernel called KF.
the following flags should be passed:
1) one of "-D BINARY_OP", "-D UNARY_OP", "-D MASK_BINARY_OP" or "-D MASK_UNARY_OP"
2) the actual operation performed, one of "-D OP_...", see below the list of operations.
2a) "-D dstDepth=<destination depth> [-D cn=<num channels>]"
for some operations, like min/max/and/or/xor it's enough
2b) "-D srcDepth1=<source1 depth> -D srcDepth2=<source2 depth> -D dstDepth=<destination depth>
-D workDepth=<work depth> [-D cn=<num channels>]" - for mixed-type operations
*/
// Double precision is enabled only when the build options define
// DOUBLE_SUPPORT; the Khronos extension is preferred and the AMD-specific
// one is used as a fallback when only that macro is available.
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
// Depth codes for 32-bit int and 32-bit float.
// NOTE(review): presumably mirror the host-side CV_32S/CV_32F values - confirm.
#define CV_32S 4
#define CV_32F 5
// Element accessors used by PROCESS_ELEM. All matrices are passed to the
// kernels as __global byte pointers plus a byte index, so the accessors cast
// the byte address to a pointer to the element type. The cast must keep the
// __global address space (an unqualified pointer type would name a different
// address space in OpenCL C 1.x) and keeps sources read-only.
#define dstelem *(__global dstT*)(dstptr + dst_index)
// Identity "conversion"; parenthesized so that it is safe for any expression.
#define noconvert(x) (x)
#ifndef workT
// Same-type mode: only dstT was supplied, so sources, work type and
// destination all share it and no conversion is needed.
#define srcT1 dstT
#define srcT2 dstT
#define workT dstT
#define srcelem1 *(__global const dstT*)(srcptr1 + src1_index)
#define srcelem2 *(__global const dstT*)(srcptr2 + src2_index)
#define convertToDT noconvert
#else
// Mixed-type mode: the build options supply srcT1/srcT2/workT together with
// convertToWT1/convertToWT2/convertToDT; sources are converted to the work
// type when they are loaded.
#define srcelem1 convertToWT1(*(__global const srcT1*)(srcptr1 + src1_index))
#define srcelem2 convertToWT2(*(__global const srcT2*)(srcptr2 + src2_index))
#endif
// Operation selection. Exactly one OP_* macro is expected from the build
// options; it defines
//   PROCESS_ELEM - the statement(s) executed for one element, and
//   EXTRA_PARAMS - extra kernel parameters appended after "int cols"
//                  (empty unless the operation needs scalars).
// Multi-statement PROCESS_ELEM bodies declare locals, so they must be
// expanded inside a brace-enclosed scope, which all KF kernels do.
#define EXTRA_PARAMS

#if defined OP_ADD_SAT
#define PROCESS_ELEM dstelem = add_sat(srcelem1, srcelem2)

#elif defined OP_ADD
#define PROCESS_ELEM dstelem = convertToDT(srcelem1 + srcelem2)

#elif defined OP_SUB_SAT
#define PROCESS_ELEM dstelem = sub_sat(srcelem1, srcelem2)

#elif defined OP_SUB
#define PROCESS_ELEM dstelem = convertToDT(srcelem1 - srcelem2)

#elif defined OP_RSUB_SAT
#define PROCESS_ELEM dstelem = sub_sat(srcelem2, srcelem1)

#elif defined OP_RSUB
#define PROCESS_ELEM dstelem = convertToDT(srcelem2 - srcelem1)

// NOTE(review): abs_diff is an integer built-in; floating-point work types
// would need fabs(a - b) here - confirm how the host selects this op.
#elif defined OP_ABSDIFF
#define PROCESS_ELEM dstelem = abs_diff(srcelem1, srcelem2)

#elif defined OP_AND
#define PROCESS_ELEM dstelem = srcelem1 & srcelem2

#elif defined OP_OR
#define PROCESS_ELEM dstelem = srcelem1 | srcelem2

#elif defined OP_XOR
#define PROCESS_ELEM dstelem = srcelem1 ^ srcelem2

#elif defined OP_NOT
#define PROCESS_ELEM dstelem = ~srcelem1

#elif defined OP_MIN
#define PROCESS_ELEM dstelem = min(srcelem1, srcelem2)

#elif defined OP_MAX
#define PROCESS_ELEM dstelem = max(srcelem1, srcelem2)

#elif defined OP_MUL
#define PROCESS_ELEM dstelem = convertToDT(srcelem1 * srcelem2)

#elif defined OP_MUL_SCALE
#undef EXTRA_PARAMS
#define EXTRA_PARAMS , workT scale
#define PROCESS_ELEM dstelem = convertToDT(srcelem1 * srcelem2 * scale)

// Division family: a zero divisor produces zero instead of dividing.
#elif defined OP_DIV
#define PROCESS_ELEM \
    workT e2 = srcelem2, zero = (workT)(0); \
    dstelem = convertToDT(e2 != zero ? srcelem1 / e2 : zero)

#elif defined OP_DIV_SCALE
#undef EXTRA_PARAMS
#define EXTRA_PARAMS , workT scale
#define PROCESS_ELEM \
    workT e2 = srcelem2, zero = (workT)(0); \
    dstelem = convertToDT(e2 != zero ? srcelem1 * scale / e2 : zero)

#elif defined OP_RECIP_SCALE
#undef EXTRA_PARAMS
#define EXTRA_PARAMS , workT scale
#define PROCESS_ELEM \
    workT e1 = srcelem1, zero = (workT)(0); \
    dstelem = convertToDT(e1 != zero ? scale / e1 : zero)

#elif defined OP_ADDW
#undef EXTRA_PARAMS
#define EXTRA_PARAMS , workT alpha, workT beta, workT gamma
#define PROCESS_ELEM dstelem = convertToDT(srcelem1*alpha + srcelem2*beta + gamma)

#elif defined OP_MAG
#define PROCESS_ELEM dstelem = hypot(srcelem1, srcelem2)

// Phase: atan2 result shifted from (-pi, pi] into [0, 2*pi) / [0, 360).
// NOTE(review): the constants are double literals; check behaviour on
// devices built without DOUBLE_SUPPORT.
#elif defined OP_PHASE_RADIANS
#define PROCESS_ELEM \
    workT tmp = atan2(srcelem2, srcelem1); \
    if(tmp < 0) tmp += 6.283185307179586232; \
    dstelem = tmp

#elif defined OP_PHASE_DEGREES
#define PROCESS_ELEM \
    workT tmp = atan2(srcelem2, srcelem1)*57.29577951308232286465; \
    if(tmp < 0) tmp += 360; \
    dstelem = tmp

#elif defined OP_EXP
#define PROCESS_ELEM dstelem = exp(srcelem1)

#elif defined OP_SQRT
#define PROCESS_ELEM dstelem = sqrt(srcelem1)

// log of the absolute value. fabs() is the floating-point absolute value;
// abs() is only defined for integer types in OpenCL C.
#elif defined OP_LOG
#define PROCESS_ELEM dstelem = log(fabs(srcelem1))

// CMP_OPERATOR (e.g. ==, <) is supplied through the build options.
#elif defined OP_CMP
#define PROCESS_ELEM dstelem = convert_uchar(srcelem1 CMP_OPERATOR srcelem2 ? 255 : 0)

#elif defined OP_CONVERT
#define PROCESS_ELEM dstelem = convertToDT(srcelem1)

#elif defined OP_CONVERT_SCALE
#undef EXTRA_PARAMS
#define EXTRA_PARAMS , workT alpha, workT beta
#define PROCESS_ELEM dstelem = convertToDT(srcelem1*alpha + beta)

#else
#error "unknown op type"
#endif
// Unary kernels have no second source matrix. For operations that still need
// a second operand (array-with-scalar forms), that operand is passed as an
// extra kernel parameter literally named srcelem2, so PROCESS_ELEM can stay
// unchanged.
// The saturating operations are selected with OP_ADD_SAT / OP_SUB_SAT /
// OP_RSUB_SAT in the dispatch above; the OP_SAT_* spellings are also accepted
// here so that build options using the older names keep working.
#if defined UNARY_OP || defined MASK_UNARY_OP
#undef srcelem2
#if defined OP_AND || defined OP_OR || defined OP_XOR || \
    defined OP_ADD || defined OP_ADD_SAT || defined OP_SAT_ADD || \
    defined OP_SUB || defined OP_SUB_SAT || defined OP_SAT_SUB || \
    defined OP_RSUB || defined OP_RSUB_SAT || defined OP_SAT_RSUB || \
    defined OP_ABSDIFF || defined OP_CMP || defined OP_MIN || defined OP_MAX
#undef EXTRA_PARAMS
#define EXTRA_PARAMS , workT srcelem2
#endif
#endif
// The single kernel KF, in one of four shapes chosen by the build options:
//   BINARY_OP       dst = op(src1, src2)
//   MASK_BINARY_OP  same, but only where mask != 0
//   UNARY_OP        dst = op(src1 [, scalar])
//   MASK_UNARY_OP   same, but only where mask != 0
// Every matrix is described by (byte pointer, row step in bytes, byte offset
// of the first element). One work-item handles element (x, y); work-items
// outside cols x rows do nothing. The mask is read one byte per element.
// sizeof() is cast to int so that all mad24() arguments have the same type.
#if defined BINARY_OP

__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
                 __global const uchar* srcptr2, int srcstep2, int srcoffset2,
                 __global uchar* dstptr, int dststep, int dstoffset,
                 int rows, int cols EXTRA_PARAMS )
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1) + srcoffset1);
        int src2_index = mad24(y, srcstep2, x*(int)sizeof(srcT2) + srcoffset2);
        int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);

        PROCESS_ELEM;
    }
}

#elif defined MASK_BINARY_OP

__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
                 __global const uchar* srcptr2, int srcstep2, int srcoffset2,
                 __global const uchar* mask, int maskstep, int maskoffset,
                 __global uchar* dstptr, int dststep, int dstoffset,
                 int rows, int cols EXTRA_PARAMS )
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int mask_index = mad24(y, maskstep, x + maskoffset);
        if( mask[mask_index] )
        {
            int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1) + srcoffset1);
            int src2_index = mad24(y, srcstep2, x*(int)sizeof(srcT2) + srcoffset2);
            int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);

            PROCESS_ELEM;
        }
    }
}

#elif defined UNARY_OP

__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
                 __global uchar* dstptr, int dststep, int dstoffset,
                 int rows, int cols EXTRA_PARAMS )
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1) + srcoffset1);
        int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);

        PROCESS_ELEM;
    }
}

#elif defined MASK_UNARY_OP

__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
                 __global const uchar* mask, int maskstep, int maskoffset,
                 __global uchar* dstptr, int dststep, int dstoffset,
                 int rows, int cols EXTRA_PARAMS )
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int mask_index = mad24(y, maskstep, x + maskoffset);
        if( mask[mask_index] )
        {
            int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1) + srcoffset1);
            int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);

            PROCESS_ELEM;
        }
    }
}

#else

#error "Unknown operation type"

#endif

@ -0,0 +1,74 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the copyright holders or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
// Masked fill: writes `value` to every element (x, y) of the destination for
// which the corresponding mask byte is non-zero.
//   mask / maskstep / maskoffset : mask bytes, row step and start offset (bytes)
//   dstptr / dststep / dstoffset : destination bytes, row step and start offset (bytes)
//   rows, cols                   : size of the processed region, in elements
//   value                        : element value to store; dstT comes from "-D dstT=..."
// One work-item per element; work-items outside cols x rows do nothing.
__kernel void setMask(__global const uchar* mask, int maskstep, int maskoffset,
                      __global uchar* dstptr, int dststep, int dstoffset,
                      int rows, int cols, dstT value )
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int mask_index = mad24(y, maskstep, x + maskoffset);
        if( mask[mask_index] )
        {
            // (int)sizeof keeps all mad24 operands of type int; the store
            // must go through a __global pointer, like dstptr itself.
            int dst_index = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
            *(__global dstT*)(dstptr + dst_index) = value;
        }
    }
}
// Unconditional fill: writes `value` to every element (x, y) of the
// destination region.
//   dstptr / dststep / dstoffset : destination bytes, row step and start offset (bytes)
//   rows, cols                   : size of the processed region, in elements
//   value                        : element value to store; dstT comes from "-D dstT=..."
// One work-item per element; work-items outside cols x rows do nothing.
__kernel void set(__global uchar* dstptr, int dststep, int dstoffset,
                  int rows, int cols, dstT value )
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        // (int)sizeof keeps all mad24 operands of type int; the store
        // must go through a __global pointer, like dstptr itself.
        int dst_index = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
        *(__global dstT*)(dstptr + dst_index) = value;
    }
}

@ -0,0 +1,96 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
// Complex number stored as float2: .x is the real part, .y the imaginary part
// (this is how cmulf and conjf below interpret the two components).
typedef float2 cfloat;
// Complex product a*b:
//   real = a.x*b.x - a.y*b.y,  imag = a.x*b.y + a.y*b.x
inline cfloat cmulf(cfloat a, cfloat b)
{
return (cfloat)( a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x);
}
// Complex conjugate: keeps the real part (.x) and negates the imaginary
// part (.y). `a` is passed by value, so the caller's copy is not modified.
inline cfloat conjf(cfloat a)
{
    a.y = -a.y;
    return a;
}
// Per-element complex product of two spectra, scaled by a real factor:
//   dst(x, y) = a(x, y) * b(x, y) * scale
//   a, b, dst : complex (float2) matrices that share the same row step
//   scale     : real factor applied to both components of the product
//   cols,rows : matrix size in complex elements
//   mstep     : row step in bytes, converted to elements below
// One work-item per element; work-items outside cols x rows do nothing.
__kernel void
mulAndScaleSpectrumsKernel(
    __global const cfloat* a,
    __global const cfloat* b,
    float scale,
    __global cfloat* dst,
    uint cols,
    uint rows,
    uint mstep
)
{
    const uint x = get_global_id(0);
    const uint y = get_global_id(1);
    // (uint)sizeof keeps every mad24 operand of type uint.
    const uint idx = mad24(y, mstep / (uint)sizeof(cfloat), x);
    if (x < cols && y < rows)
    {
        cfloat v = cmulf(a[idx], b[idx]);
        dst[idx] = (cfloat)( v.x * scale, v.y * scale );
    }
}
// Per-element complex product of a spectrum with the conjugate of another,
// scaled by a real factor:
//   dst(x, y) = a(x, y) * conj(b(x, y)) * scale
//   a, b, dst : complex (float2) matrices that share the same row step
//   scale     : real factor applied to both components of the product
//   cols,rows : matrix size in complex elements
//   mstep     : row step in bytes, converted to elements below
// One work-item per element; work-items outside cols x rows do nothing.
__kernel void
mulAndScaleSpectrumsKernel_CONJ(
    __global const cfloat* a,
    __global const cfloat* b,
    float scale,
    __global cfloat* dst,
    uint cols,
    uint rows,
    uint mstep
)
{
    const uint x = get_global_id(0);
    const uint y = get_global_id(1);
    // (uint)sizeof keeps every mad24 operand of type uint.
    const uint idx = mad24(y, mstep / (uint)sizeof(cfloat), x);
    if (x < cols && y < rows)
    {
        cfloat v = cmulf(a[idx], conjf(b[idx]));
        dst[idx] = (cfloat)( v.x * scale, v.y * scale );
    }
}

@ -0,0 +1,73 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the copyright holders or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
// NOTE(review): despite its name this kernel performs no polar-to-Cartesian
// math. The body is a masked fill: it stores `value` into every destination
// element whose mask byte is non-zero. It looks like a placeholder that is
// still to be implemented - confirm before relying on it.
//   mask / maskstep / maskoffset : mask bytes, row step and start offset (bytes)
//   dstptr / dststep / dstoffset : destination bytes, row step and start offset (bytes)
//   rows, cols                   : size of the processed region, in elements
//   value                        : element value to store; dstT comes from the build options
__kernel void polarToCart(__global const uchar* mask, int maskstep, int maskoffset,
                          __global uchar* dstptr, int dststep, int dstoffset,
                          int rows, int cols, dstT value )
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int mask_index = mad24(y, maskstep, x + maskoffset);
        if( mask[mask_index] )
        {
            // (int)sizeof keeps all mad24 operands of type int; the store
            // must go through a __global pointer, like dstptr itself.
            int dst_index = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
            *(__global dstT*)(dstptr + dst_index) = value;
        }
    }
}
// NOTE(review): despite its name this kernel performs no Cartesian-to-polar
// math. The body is a plain fill: it stores `value` into every destination
// element. It looks like a placeholder that is still to be implemented -
// confirm before relying on it.
//   dstptr / dststep / dstoffset : destination bytes, row step and start offset (bytes)
//   rows, cols                   : size of the processed region, in elements
//   value                        : element value to store; dstT comes from the build options
__kernel void cartToPolar(__global uchar* dstptr, int dststep, int dstoffset,
                          int rows, int cols, dstT value )
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        // (int)sizeof keeps all mad24 operands of type int; the store
        // must go through a __global pointer, like dstptr itself.
        int dst_index = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
        *(__global dstT*)(dstptr + dst_index) = value;
    }
}

@ -0,0 +1,104 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Shengen Yan,yanshengen@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
// Double precision is enabled only when the build options define
// DOUBLE_SUPPORT; the Khronos extension is preferred and the AMD-specific
// one is used as a fallback when only that macro is available.
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
// FUNC(a, b) accumulates one converted source value `a` into the running
// total `b`. The build options select the flavour:
//   FUNC_SUM      b += a
//   FUNC_ABS_SUM  b += |a|
//   FUNC_SQR_SUM  b += a*a
// Arguments are parenthesized so the macro also expands correctly for
// arguments that are expressions. The trailing ';' is kept so that existing
// "FUNC(x, y);" call sites are unaffected.
#if FUNC_SUM
#define FUNC(a, b) (b) += (a);
#elif FUNC_ABS_SUM
#define FUNC(a, b) (b) += ((a) >= (dstT)(0) ? (a) : -(a));
#elif FUNC_SQR_SUM
#define FUNC(a, b) (b) += (a) * (a);
#else
#error No sum function
#endif
/**************************************Array buffer SUM**************************************/
// Partial reduction: every work-group accumulates FUNC() over a strided
// subset of the source elements and writes one partial result to dst[gid];
// the per-group results still have to be combined by the caller.
//   cols         : number of valid elements per source row
//   invalid_cols : number of elements skipped after each row
//   offset       : index of the first valid source element
//   elemnum      : total number of valid elements
//   groupnum     : used to derive the stride (groupnum * 256) between the
//                  elements visited by one work-item
//   src, dst     : source elements / one partial sum per work-group
// NOTE(review): the 128-entry local buffer, the "lid - 128" fold and the
// "<< 8" stride suggest the host launches work-groups of 256 items - confirm.
__kernel void arithm_op_sum(int cols,int invalid_cols,int offset,int elemnum,int groupnum,
                            __global srcT *src, __global dstT *dst)
{
    unsigned int lid = get_local_id(0);
    unsigned int gid = get_group_id(0);
    unsigned int id = get_global_id(0);

    __local dstT localmem_sum[128];
    dstT sum = (dstT)(0);

    // Each work-item walks the valid elements with a fixed stride; the
    // linear element id is mapped to a buffer index that skips the
    // invalid_cols padding after every row.
    for (int grainSize = groupnum << 8; id < elemnum; id += grainSize)
    {
        unsigned int idx = offset + id + (id / cols) * invalid_cols;
        dstT temp = convertToDstT(src[idx]);
        FUNC(temp, sum);
    }

    // First fold: the upper half of the work-group parks its sums in local
    // memory, then the lower half adds its own sums on top.
    if (lid > 127)
        localmem_sum[lid - 128] = sum;
    barrier(CLK_LOCAL_MEM_FENCE);

    if (lid < 128)
        localmem_sum[lid] = sum + localmem_sum[lid];
    barrier(CLK_LOCAL_MEM_FENCE);

    // Tree reduction of the 128 local values down to localmem_sum[0].
    for (int lsize = 64; lsize > 0; lsize >>= 1)
    {
        if (lid < lsize)
        {
            int lid2 = lsize + lid;
            localmem_sum[lid] = localmem_sum[lid] + localmem_sum[lid2];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (lid == 0)
        dst[gid] = localmem_sum[0];
}

@ -67,6 +67,8 @@
#define GET_OPTIMIZED(func) (func)
#endif
#include "opencl_kernels.hpp"
namespace cv
{
@ -205,13 +207,30 @@ enum { BLOCK_SIZE = 1024 };
// Tells whether the matrix `sc` can be treated as a scalar operand for an
// array of type `atype`.
//   sc     : candidate scalar
//   atype  : type of the array the scalar is combined with (only its channel
//            count is used)
//   sckind : _InputArray kind of the scalar
//   akind  : _InputArray kind of the array
// Returns true when `sc` is a continuous, at most 2-dimensional row or column
// vector whose size is 1x1, 1xcn or cnx1, or a 1x4 (Size(1, 4)) CV_64F vector
// with cn <= 4. A MATX array additionally requires a MATX scalar.
inline bool checkScalar(const Mat& sc, int atype, int sckind, int akind)
{
    if( sc.dims > 2 || !sc.isContinuous() )
        return false;
    // must be a single row or a single column
    Size sz = sc.size();
    if(sz.width != 1 && sz.height != 1)
        return false;
    int cn = CV_MAT_CN(atype);
    if( akind == _InputArray::MATX && sckind != _InputArray::MATX )
        return false;
    return sz == Size(1, 1) || sz == Size(1, cn) || sz == Size(cn, 1) ||
           (sz == Size(1, 4) && sc.type() == CV_64F && cn <= 4);
}
// InputArray overload: tells whether `sc` can be treated as a scalar operand
// for an array of type `atype`.
//   sc     : candidate scalar
//   atype  : type of the array the scalar is combined with (only its channel
//            count is used)
//   sckind : _InputArray kind of the scalar
//   akind  : _InputArray kind of the array
// Returns true when `sc` is a continuous, at most 2-dimensional row or column
// vector whose size is 1x1, 1xcn or cnx1, or a 1x4 (Size(1, 4)) CV_64F vector
// with cn <= 4. A MATX array additionally requires a MATX scalar.
inline bool checkScalar(InputArray sc, int atype, int sckind, int akind)
{
    if( sc.dims() > 2 || !sc.isContinuous() )
        return false;
    // must be a single row or a single column; the size is queried once
    Size sz = sc.size();
    if(sz.width != 1 && sz.height != 1)
        return false;
    int cn = CV_MAT_CN(atype);
    if( akind == _InputArray::MATX && sckind != _InputArray::MATX )
        return false;
    return sz == Size(1, 1) || sz == Size(1, cn) || sz == Size(cn, 1) ||
           (sz == Size(1, 4) && sc.type() == CV_64F && cn <= 4);
}
void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t blocksize );
@ -227,7 +246,10 @@ struct TLSData
static TLSData* get();
};
namespace ocl { MatAllocator* getOpenCLAllocator(); }
namespace ocl
{
MatAllocator* getOpenCLAllocator();
}
}

@ -197,6 +197,7 @@ UMat Mat::getUMat(int accessFlags) const
if(!u)
return hdr;
UMat::getStdAllocator()->allocate(u, accessFlags);
hdr.flags = flags;
setSize(hdr, dims, size.p, step.p);
finalizeHdr(hdr);
hdr.u = u;
@ -548,7 +549,8 @@ Mat UMat::getMat(int accessFlags) const
CV_Assert(u->data != 0);
Mat hdr(dims, size.p, type(), u->data + offset, step.p);
hdr.u = u;
hdr.datastart = hdr.data = u->data;
hdr.datastart = u->data;
hdr.data = hdr.datastart + offset;
hdr.datalimit = hdr.dataend = u->data + u->size;
CV_XADD(&hdr.u->refcount, 1);
return hdr;
@ -617,7 +619,7 @@ void UMat::copyTo(OutputArray _dst) const
void* dsthandle = dst.handle(ACCESS_WRITE);
if( srchandle == dsthandle && dst.offset == offset )
return;
ndoffset(dstofs);
dst.ndoffset(dstofs);
CV_Assert(u->currAllocator == dst.u->currAllocator);
u->currAllocator->copy(u, dst.u, dims, sz, srcofs, step.p, dstofs, dst.step.p, false);
}
@ -633,6 +635,50 @@ void UMat::convertTo(OutputArray, int, double, double) const
CV_Error(Error::StsNotImplemented, "");
}
// Sets all elements (or, with a non-empty mask, the elements whose mask byte
// is non-zero) to the given scalar value.
//   _value : scalar to store; on the OpenCL path it must pass checkScalar()
//   _mask  : optional mask; on the OpenCL path it must be CV_8U and have the
//            same size as this matrix
// Returns *this.
// For matrices with at most 2 dimensions and at most 4 channels, and when
// OpenCL is in use, the "set"/"setMask" kernels are tried first. If the
// kernel cannot be created or run, or the conditions do not hold, the
// operation is performed through a Mat view via Mat::setTo.
UMat& UMat::setTo(InputArray _value, InputArray _mask)
{
    bool haveMask = !_mask.empty();
    int t = type(), cn = CV_MAT_CN(t);
    if( dims <= 2 && cn <= 4 && ocl::useOpenCL() )
    {
        Mat value = _value.getMat();
        CV_Assert( checkScalar(value, type(), _value.kind(), _InputArray::UMAT) );
        // One element of type t, converted from the scalar; 4 doubles are
        // enough for the largest element allowed here (cn <= 4).
        double buf[4];
        convertAndUnrollScalar(value, t, (uchar*)buf, 1);

        // The kernels only move bits, so a same-size integer type is used.
        char opts[1024];
        sprintf(opts, "-D dstT=%s", ocl::memopTypeToStr(t));

        ocl::Kernel setK(haveMask ? "setMask" : "set", ocl::core::copyset_oclsrc, opts);
        if( !setK.empty() )
        {
            ocl::KernelArg scalararg(0, 0, 0, buf, CV_ELEM_SIZE(t));
            // Declared outside the branch so that it stays alive until run().
            UMat mask;

            if( haveMask )
            {
                mask = _mask.getUMat();
                CV_Assert( mask.size() == size() && mask.type() == CV_8U );
                ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask);
                // masked-out elements keep their old value, hence ReadWrite
                ocl::KernelArg dstarg = ocl::KernelArg::ReadWrite(*this);
                setK.args(maskarg, dstarg, scalararg);
            }
            else
            {
                ocl::KernelArg dstarg = ocl::KernelArg::WriteOnly(*this);
                setK.args(dstarg, scalararg);
            }

            // Explicit casts: int -> size_t inside a braced initializer is a
            // narrowing conversion.
            size_t globalsize[] = { (size_t)cols, (size_t)rows };
            if( setK.run(2, globalsize, 0, false) )
                return *this;
        }
    }
    // Fallback through a Mat view of the same data.
    Mat m = getMat(haveMask ? ACCESS_RW : ACCESS_WRITE);
    m.setTo(_value, _mask);
    return *this;
}
UMat& UMat::operator = (const Scalar&)
{
CV_Error(Error::StsNotImplemented, "");

@ -91,11 +91,11 @@ bool CV_UMatTest::TestUMat()
{
try
{
Mat a(100, 100, CV_16S), b;
Mat a(100, 100, CV_16SC2), b, c;
randu(a, Scalar::all(-100), Scalar::all(100));
Rect roi(1, 3, 10, 20);
Mat ra(a, roi), rb;
UMat ua, ura;
Rect roi(1, 3, 5, 4);
Mat ra(a, roi), rb, rc, rc0;
UMat ua, ura, ub, urb, uc, urc;
a.copyTo(ua);
ua.copyTo(b);
CHECK_DIFF(a, b);
@ -112,6 +112,71 @@ bool CV_UMatTest::TestUMat()
}
ra.copyTo(rb);
CHECK_DIFF(ra, rb);
b = a.clone();
ra = a(roi);
rb = b(roi);
randu(b, Scalar::all(-100), Scalar::all(100));
b.copyTo(ub);
urb = ub(roi);
/*std::cout << "==============================================\nbefore op (CPU):\n";
std::cout << "ra: " << ra << std::endl;
std::cout << "rb: " << rb << std::endl;*/
ra.copyTo(ura);
rb.copyTo(urb);
ra.release();
rb.release();
ura.copyTo(ra);
urb.copyTo(rb);
/*std::cout << "==============================================\nbefore op (GPU):\n";
std::cout << "ra: " << ra << std::endl;
std::cout << "rb: " << rb << std::endl;*/
cv::max(ra, rb, rc);
cv::max(ura, urb, urc);
urc.copyTo(rc0);
/*std::cout << "==============================================\nafter op:\n";
std::cout << "rc: " << rc << std::endl;
std::cout << "rc0: " << rc0 << std::endl;*/
CHECK_DIFF(rc0, rc);
{
UMat tmp = rc0.getUMat(ACCESS_WRITE);
cv::max(ura, urb, tmp);
}
CHECK_DIFF(rc0, rc);
ura.copyTo(urc);
cv::max(urc, urb, urc);
urc.copyTo(rc0);
CHECK_DIFF(rc0, rc);
rc = ra ^ rb;
cv::bitwise_xor(ura, urb, urc);
urc.copyTo(rc0);
/*std::cout << "==============================================\nafter op:\n";
std::cout << "ra: " << rc0 << std::endl;
std::cout << "rc: " << rc << std::endl;*/
CHECK_DIFF(rc0, rc);
rc = ra + rb;
cv::add(ura, urb, urc);
urc.copyTo(rc0);
CHECK_DIFF(rc0, rc);
cv::subtract(ra, Scalar::all(5), rc);
cv::subtract(ura, Scalar::all(5), urc);
urc.copyTo(rc0);
CHECK_DIFF(rc0, rc);
}
catch (const test_excep& e)
{

@ -511,9 +511,10 @@ public:
CV_WRAP virtual void release();
CV_WRAP virtual bool grab();
CV_WRAP virtual bool retrieve(CV_OUT Mat& image, int flag = 0);
CV_WRAP virtual bool retrieve(OutputArray image, int flag = 0);
virtual VideoCapture& operator >> (CV_OUT Mat& image);
CV_WRAP virtual bool read(CV_OUT Mat& image);
virtual VideoCapture& operator >> (CV_OUT UMat& image);
CV_WRAP virtual bool read(OutputArray image);
CV_WRAP virtual bool set(int propId, double value);
CV_WRAP virtual double get(int propId);

@ -515,7 +515,7 @@ bool VideoCapture::grab()
return cvGrabFrame(cap) != 0;
}
bool VideoCapture::retrieve(Mat& image, int channel)
bool VideoCapture::retrieve(OutputArray image, int channel)
{
IplImage* _img = cvRetrieveFrame(cap, channel);
if( !_img )
@ -533,7 +533,7 @@ bool VideoCapture::retrieve(Mat& image, int channel)
return true;
}
bool VideoCapture::read(Mat& image)
bool VideoCapture::read(OutputArray image)
{
if(grab())
retrieve(image);
@ -548,6 +548,12 @@ VideoCapture& VideoCapture::operator >> (Mat& image)
return *this;
}
// Stream-style overload for UMat destinations: forwards to read() and hands
// back the capture object so that extractions can be chained. The boolean
// result of read() is discarded.
VideoCapture& VideoCapture::operator >> (UMat& image)
{
    VideoCapture& self = *this;
    self.read(image);
    return self;
}
bool VideoCapture::set(int propId, double value)
{
return cvSetCaptureProperty(cap, propId, value) != 0;

@ -2687,6 +2687,124 @@ struct mRGBA2RGBA
}
};
// OpenCL branch of cvtColor(): selects a kernel from cvtcolor_oclsrc for the
// requested conversion, allocates the destination and launches the kernel.
//
//   _src  input array (fetched as UMat)
//   _dst  output array; created here with the size/type implied by `code`
//   code  COLOR_* conversion code
//   dcn   requested destination channel count; only consulted for
//         YUV2BGR/YUV2RGB (a negative value selects 3), otherwise it is
//         derived from `code`
//
// Returns true only if a kernel was created and ran successfully. Returns
// false for unsupported depths, for conversion codes without an OpenCL
// kernel and when kernel creation fails, so that the caller can fall back to
// the CPU implementation instead of returning with an untouched destination.
static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
{
    UMat src = _src.getUMat(), dst;
    Size sz = src.size(), dstSz = sz;
    int scn = src.channels(), depth = src.depth(), bidx = 0, dtype = -1;
    // One work-item per source pixel unless a case overrides it below.
    size_t globalsize[] = { (size_t)src.cols, (size_t)src.rows };
    ocl::Kernel k;

    if(depth != CV_8U && depth != CV_16U && depth != CV_32F)
        return false;

    switch (code)
    {
    /*
    case COLOR_BGR2BGRA: case COLOR_RGB2BGRA: case COLOR_BGRA2BGR:
    case COLOR_RGBA2BGR: case COLOR_RGB2BGR: case COLOR_BGRA2RGBA:
    case COLOR_BGR2BGR565: case COLOR_BGR2BGR555: case COLOR_RGB2BGR565: case COLOR_RGB2BGR555:
    case COLOR_BGRA2BGR565: case COLOR_BGRA2BGR555: case COLOR_RGBA2BGR565: case COLOR_RGBA2BGR555:
    case COLOR_BGR5652BGR: case COLOR_BGR5552BGR: case COLOR_BGR5652RGB: case COLOR_BGR5552RGB:
    case COLOR_BGR5652BGRA: case COLOR_BGR5552BGRA: case COLOR_BGR5652RGBA: case COLOR_BGR5552RGBA:
    */
    case COLOR_BGR2GRAY:
    case COLOR_BGRA2GRAY:
    case COLOR_RGB2GRAY:
    case COLOR_RGBA2GRAY:
    {
        CV_Assert(scn == 3 || scn == 4);
        bidx = code == COLOR_BGR2GRAY || code == COLOR_BGRA2GRAY ? 0 : 2;
        dtype = depth; // single channel
        k.create("RGB2Gray", ocl::imgproc::cvtcolor_oclsrc,
                 format("-D depth=%d -D scn=%d -D dcn=1 -D bidx=%d", depth, scn, bidx));
        break;
    }
    case COLOR_GRAY2BGR:
    case COLOR_GRAY2BGRA:
    {
        CV_Assert(scn == 1);
        dcn = code == COLOR_GRAY2BGRA ? 4 : 3;
        dtype = CV_MAKETYPE(depth, dcn);
        k.create("Gray2RGB", ocl::imgproc::cvtcolor_oclsrc,
                 format("-D depth=%d -D scn=1 -D dcn=%d", depth, dcn));
        break;
    }
    case COLOR_BGR2YUV:
    case COLOR_RGB2YUV:
    {
        CV_Assert(scn == 3 || scn == 4);
        // NOTE(review): the RGB/BGR -> bidx mapping here is the opposite of the
        // GRAY and YCrCb cases (where BGR selects 0) - confirm against the kernel.
        bidx = code == COLOR_RGB2YUV ? 0 : 2;
        dtype = CV_MAKETYPE(depth, 3); // kernel is built with dcn=3
        k.create("RGB2YUV", ocl::imgproc::cvtcolor_oclsrc,
                 format("-D depth=%d -D scn=%d -D dcn=3 -D bidx=%d", depth, scn, bidx));
        break;
    }
    case COLOR_YUV2BGR:
    case COLOR_YUV2RGB:
    {
        if(dcn < 0) dcn = 3;
        CV_Assert(dcn == 3 || dcn == 4);
        // NOTE(review): same bidx orientation question as for RGB2YUV above.
        bidx = code == COLOR_YUV2RGB ? 0 : 2;
        dtype = CV_MAKETYPE(depth, dcn);
        k.create("YUV2RGB", ocl::imgproc::cvtcolor_oclsrc,
                 format("-D depth=%d -D scn=3 -D dcn=%d -D bidx=%d", depth, dcn, bidx));
        break;
    }
    case COLOR_YUV2RGB_NV12:
    case COLOR_YUV2BGR_NV12:
    case COLOR_YUV2RGBA_NV12:
    case COLOR_YUV2BGRA_NV12:
    {
        CV_Assert( scn == 1 );
        CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
        dcn = code == COLOR_YUV2BGRA_NV12 || code == COLOR_YUV2RGBA_NV12 ? 4 : 3;
        bidx = code == COLOR_YUV2BGRA_NV12 || code == COLOR_YUV2BGR_NV12 ? 0 : 2;
        // The source holds rows*3/2 lines; the colour image has 2/3 of them.
        dstSz = Size(sz.width, sz.height * 2 / 3);
        dtype = CV_MAKETYPE(depth, dcn);
        // NOTE(review): dimension 0 receives the height and dimension 1 the
        // width, the reverse of the default {cols, rows} above - confirm
        // against the YUV2RGBA_NV12 kernel.
        globalsize[0] = dstSz.height/2;
        globalsize[1] = dstSz.width/2;
        k.create("YUV2RGBA_NV12", ocl::imgproc::cvtcolor_oclsrc,
                 format("-D depth=0 -D scn=1 -D dcn=%d -D bidx=%d", dcn, bidx));
        break;
    }
    case COLOR_BGR2YCrCb:
    case COLOR_RGB2YCrCb:
    {
        CV_Assert(scn == 3 || scn == 4);
        bidx = code == COLOR_BGR2YCrCb ? 0 : 2;
        dtype = CV_MAKETYPE(depth, 3); // kernel is built with dcn=3
        k.create("RGB2YCrCb", ocl::imgproc::cvtcolor_oclsrc,
                 format("-D depth=%d -D scn=%d -D dcn=3 -D bidx=%d", depth, scn, bidx));
        break;
    }
    case COLOR_YCrCb2BGR:
    case COLOR_YCrCb2RGB:
        // No OpenCL kernel yet: k stays empty and the function reports failure.
        break;
    /*
    case COLOR_BGR5652GRAY: case COLOR_BGR5552GRAY:
    case COLOR_GRAY2BGR565: case COLOR_GRAY2BGR555:
    case COLOR_BGR2YCrCb: case COLOR_RGB2YCrCb:
    case COLOR_BGR2XYZ: case COLOR_RGB2XYZ:
    case COLOR_XYZ2BGR: case COLOR_XYZ2RGB:
    case COLOR_BGR2HSV: case COLOR_RGB2HSV: case COLOR_BGR2HSV_FULL: case COLOR_RGB2HSV_FULL:
    case COLOR_BGR2HLS: case COLOR_RGB2HLS: case COLOR_BGR2HLS_FULL: case COLOR_RGB2HLS_FULL:
    case COLOR_HSV2BGR: case COLOR_HSV2RGB: case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL:
    case COLOR_HLS2BGR: case COLOR_HLS2RGB: case COLOR_HLS2BGR_FULL: case COLOR_HLS2RGB_FULL:
    */
    default:
        ;
    }

    // Unsupported conversion code or kernel build failure: let the caller
    // run the CPU implementation.
    if( k.empty() )
        return false;

    _dst.create(dstSz, dtype);
    dst = _dst.getUMat();
    k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst));
    return k.run(2, globalsize, 0, false);
}
}//namespace cv
//////////////////////////////////////////////////////////////////////////////////////////
@ -2695,9 +2813,15 @@ struct mRGBA2RGBA
void cv::cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
{
bool use_opencl = ocl::useOpenCL() && _dst.kind() == _InputArray::UMAT;
int stype = _src.type();
int scn = CV_MAT_CN(stype), depth = CV_MAT_DEPTH(stype), bidx;
if( use_opencl && ocl_cvtColor(_src, _dst, code, dcn) )
return;
Mat src = _src.getMat(), dst;
Size sz = src.size();
int scn = src.channels(), depth = src.depth(), bidx;
CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_32F );

@ -1901,8 +1901,43 @@ private:
};
#endif
// OpenCL branch of resize(): runs the "resizeNN" or "resizeLN" kernel from
// resize_oclsrc on UMat data.
//
//   _src           input array
//   _dst           output array; it is fetched as is, this function does not
//                  allocate or resize it
//   fx, fy         horizontal / vertical scale factors; their reciprocals
//                  are what is passed to the kernel
//   interpolation  INTER_NEAREST (any depth) or INTER_LINEAR (CV_8U / CV_32F)
//
// Returns false when the combination is not supported (more than 4 channels,
// other interpolation modes or depths), when the kernel could not be created
// or when the launch fails; true when the kernel ran.
static bool ocl_resize( InputArray _src, OutputArray _dst,
                        double fx, double fy, int interpolation)
{
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    if( !(cn <= 4 &&
          (interpolation == INTER_NEAREST ||
           (interpolation == INTER_LINEAR && (depth == CV_8U || depth == CV_32F)))) )
        return false;

    UMat src = _src.getUMat(), dst = _dst.getUMat();
    ocl::Kernel k;

    if (interpolation == INTER_LINEAR)
    {
        // Intermediate arithmetic type: 32-bit int for 8-bit data, float otherwise.
        int wdepth = depth == CV_8U ? CV_32S : CV_32F;
        int wtype = CV_MAKETYPE(wdepth, cn);
        char buf[2][32];
        // depth is an int, so it has to be formatted with %d (not %s).
        k.create("resizeLN", ocl::imgproc::resize_oclsrc,
                 format("-D INTER_LINEAR -D depth=%d -D PIXTYPE=%s -D WORKTYPE=%s -D convertToWT=%s -D convertToDT=%s",
                        depth, ocl::typeToStr(type), ocl::typeToStr(wtype),
                        ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
                        ocl::convertTypeStr(wdepth, depth, cn, buf[1])));
    }
    else if (interpolation == INTER_NEAREST)
    {
        k.create("resizeNN", ocl::imgproc::resize_oclsrc,
                 format("-D INTER_NEAREST -D PIXTYPE=%s", ocl::memopTypeToStr(type) ));
    }

    if( k.empty() )
        return false;

    k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
           (float)(1./fx), (float)(1./fy));
    // One work-item per destination pixel.
    size_t globalsize[] = { (size_t)dst.cols, (size_t)dst.rows };
    return k.run(2, globalsize, 0, false);
}
}
//////////////////////////////////////////////////////////////////////////////////////////
@ -2013,25 +2048,28 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
resizeArea_<double, double>, 0
};
Mat src = _src.getMat();
Size ssize = src.size();
Size ssize = _src.size();
CV_Assert( ssize.area() > 0 );
CV_Assert( dsize.area() || (inv_scale_x > 0 && inv_scale_y > 0) );
if( !dsize.area() )
{
dsize = Size(saturate_cast<int>(src.cols*inv_scale_x),
saturate_cast<int>(src.rows*inv_scale_y));
dsize = Size(saturate_cast<int>(ssize.width*inv_scale_x),
saturate_cast<int>(ssize.height*inv_scale_y));
CV_Assert( dsize.area() );
}
else
{
inv_scale_x = (double)dsize.width/src.cols;
inv_scale_y = (double)dsize.height/src.rows;
inv_scale_x = (double)dsize.width/ssize.width;
inv_scale_y = (double)dsize.height/ssize.height;
}
_dst.create(dsize, src.type());
Mat dst = _dst.getMat();
_dst.create(dsize, _src.type());
if( ocl::useOpenCL() && _dst.kind() == _InputArray::UMAT &&
ocl_resize(_src, _dst, inv_scale_x, inv_scale_y, interpolation) )
return;
Mat src = _src.getMat(), dst = _dst.getMat();
#ifdef HAVE_TEGRA_OPTIMIZATION
if (tegra::resize(src, dst, (float)inv_scale_x, (float)inv_scale_y, interpolation))

@ -0,0 +1,145 @@
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Rock Li, Rock.li@amd.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
// Bilateral filter for 8-bit single-channel data, one output pixel per
// work-item.
//
// For the pixel at (x, y) the source is addressed at (y + radius, x + radius),
// i.e. the source is presumably a copy padded by `radius` on each side
// (TODO confirm against the host code). Each of the `maxk` neighbours is
// reached through space_ofs[k] and weighted by
//     space_weight[k] * color_weight[|neighbour - centre|].
// The weighted mean is rounded by adding 0.5 and converting toward zero.
// src_rows / src_cols are accepted but not used here.
__kernel void bilateral_C1_D0(__global uchar *dst,
                              __global const uchar *src,
                              const int dst_rows,
                              const int dst_cols,
                              const int maxk,
                              const int radius,
                              const int dst_step,
                              const int dst_offset,
                              const int src_step,
                              const int src_rows,
                              const int src_cols,
                              __constant float *color_weight,
                              __constant float *space_weight,
                              __constant int *space_ofs)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items that fall outside the destination have nothing to do.
    if (row >= dst_rows || col >= dst_cols)
        return;

    const int center_idx = mad24(row + radius, src_step, col + radius);
    const int out_idx = mad24(row, dst_step, col + dst_offset);
    const int center = (int)src[center_idx];

    float acc = 0.f;
    float weight_total = 0.f;

    int k = 0;
    while (k < maxk)
    {
        const int sample = (int)src[center_idx + space_ofs[k]];
        const float w = space_weight[k] * color_weight[abs(sample - center)];
        acc += (float)(sample) * w;
        weight_total += w;
        ++k;
    }

    dst[out_idx] = convert_uchar_rtz(acc / weight_total + 0.5f);
}
// Bilateral filter for 8-bit single-channel data, four horizontally adjacent
// output pixels per work-item (x = get_global_id(0) * 4).
//
// Same weighting scheme as bilateral_C1_D0, evaluated component-wise on
// 4-wide vectors. Only the first of the four columns is tested against
// dst_cols, and the result is stored through a uchar4 pointer, so the
// destination presumably has to provide room for (and alignment of) whole
// groups of four pixels - TODO confirm against the host code.
// src_rows / src_cols are accepted but not used here.
__kernel void bilateral2_C1_D0(__global uchar *dst,
                               __global const uchar *src,
                               const int dst_rows,
                               const int dst_cols,
                               const int maxk,
                               const int radius,
                               const int dst_step,
                               const int dst_offset,
                               const int src_step,
                               const int src_rows,
                               const int src_cols,
                               __constant float *color_weight,
                               __constant float *space_weight,
                               __constant int *space_ofs)
{
    const int col = get_global_id(0) << 2;
    const int row = get_global_id(1);

    if (row >= dst_rows || col >= dst_cols)
        return;

    const int center_idx = mad24(row + radius, src_step, col + radius);
    const int out_idx = mad24(row, dst_step, col + dst_offset);
    const int4 center = convert_int4(vload4(0, src + center_idx));

    float4 acc = (float4)(0.f);
    float4 weight_total = (float4)(0.f);

    int k = 0;
    while (k < maxk)
    {
        const int4 sample = convert_int4(vload4(0, src + center_idx + space_ofs[k]));
        // Per-lane intensity distance selects the range weight.
        const uint4 dist = abs(sample - center);
        const float4 range_w = (float4)(color_weight[dist.x], color_weight[dist.y],
                                        color_weight[dist.z], color_weight[dist.w]);
        const float4 w = (float4)(space_weight[k]) * range_w;
        acc += convert_float4(sample) * w;
        weight_total += w;
        ++k;
    }

    *(__global uchar4*)(dst + out_idx) = convert_uchar4_rtz(acc/weight_total+0.5f);
}
// Bilateral filter for 8-bit four-channel data, one output pixel per
// work-item.
//
// src and dst are uchar4 pointers and are indexed directly with the step and
// offset arguments, so those are presumably expressed in pixels rather than
// bytes (TODO confirm against the host code). The range weight is looked up
// with the sum of absolute differences of the first three channels; the
// fourth channel does not influence the weight but is filtered like the
// others. src_rows / src_cols are accepted but not used here.
__kernel void bilateral_C4_D0(__global uchar4 *dst,
                              __global const uchar4 *src,
                              const int dst_rows,
                              const int dst_cols,
                              const int maxk,
                              const int radius,
                              const int dst_step,
                              const int dst_offset,
                              const int src_step,
                              const int src_rows,
                              const int src_cols,
                              __constant float *color_weight,
                              __constant float *space_weight,
                              __constant int *space_ofs)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    if (row >= dst_rows || col >= dst_cols)
        return;

    const int center_idx = mad24(row + radius, src_step, col + radius);
    const int out_idx = mad24(row, dst_step, col + dst_offset);
    const int4 center = convert_int4(src[center_idx]);

    float4 acc = (float4)0.f;
    float weight_total = 0.f;

    int k = 0;
    while (k < maxk)
    {
        const int4 sample = convert_int4(src[center_idx + space_ofs[k]]);
        const uint4 dist = abs(sample - center);
        const float w = space_weight[k] * color_weight[dist.x + dist.y + dist.z];
        acc += convert_float4(sample) * (float4)w;
        weight_total += w;
        ++k;
    }

    // Normalise with a single reciprocal, then round by adding 0.5.
    const float inv_total = 1.f / weight_total;
    dst[out_idx] = convert_uchar4_rtz(acc * (float4)inv_total + (float4)0.5f);
}

@ -0,0 +1,478 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Zhang Ying, zhangying913@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////Macro for border type////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////
// Border extrapolation helpers. Each BORDER_* section defines the same four
// macros; which section is active depends on the BORDER_* symbol defined when
// the program is built (presumably exactly one is passed by the host - TODO
// confirm).
//
//   ADDR_L(i, l_edge, r_edge)  column index to use when i is left of l_edge,
//                              otherwise i itself
//   ADDR_R(i, r_edge, addr)    column index to use when i is at/after r_edge,
//                              otherwise `addr` (the result of ADDR_L)
//   ADDR_H(i, t_edge, b_edge)  row counterpart of ADDR_L (top edge)
//   ADDR_B(i, b_edge, addr)    row counterpart of ADDR_R (bottom edge)
//
// The REFLECT, REFLECT_101 and WRAP variants of ADDR_L / ADDR_H do not use the
// value of the left/top edge in the remapped index, i.e. they are written for
// an edge at 0, which is what the visible call sites pass.
#ifdef BORDER_REPLICATE
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#endif
#ifdef BORDER_REFLECT
//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
#endif
#ifdef BORDER_REFLECT_101
//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
#endif
//blur function does not support BORDER_WRAP
#ifdef BORDER_WRAP
//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
#endif
// Length of the per-row __local buffers that the kernels below index with
// get_local_id(0).
#define THREADS 256
// ELEM yields elem1 when i lies in [l_edge, r_edge) and elem2 otherwise.
// The expansion is wrapped in parentheses so that the conditional keeps its
// meaning when the macro is used inside a larger expression
// (e.g. `a + ELEM(...)`), where ?: would otherwise bind last.
#define ELEM(i, l_edge, r_edge, elem1, elem2) ((i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2))
// Horizontal pass and store for the 8-bit single-channel box filter.
//
// `temp` is a row of per-column sums viewed as plain uints; every
// participating work-item adds up 2*anX+1 four-wide loads, each shifted by one
// element, divides by `alpha` converted to an unsigned integer vector and
// stores up to four bytes at dst[0..3], each guarded by a column range check.
// anX and ksX are not defined in this function; they presumably come from the
// program build options (TODO confirm).
// NOTE(review): posY is derived from the group id only, so both calls made by
// the kernel (first and second output row) are checked against the same row -
// confirm this is safe when dst_rows is odd.
inline void update_dst_C1_D0(__global uchar *dst, __local uint* temp,
                             int dst_rows, int dst_cols,
                             int dst_startX, int dst_x_off,
                             float alpha)
{
    const size_t lid = get_local_id(0);

    // Work-items that only supplied halo columns produce no output.
    if (lid < anX || lid >= (THREADS-ksX+anX+1))
        return;

    const int posX = dst_startX - dst_x_off + (lid-anX)*4;
    const int posY = (get_group_id(1) << 1);

    uint4 total = 0;
    for (int shift = -anX; shift <= anX; ++shift)
        total += vload4(lid, temp+shift);

    if (!(posY < dst_rows && posX < dst_cols))
        return;

    total /= (uint4) alpha;

    const uint lanes[4] = { total.x, total.y, total.z, total.w };
    for (int c = 0; c < 4; ++c)
    {
        if (posX+c >= 0 && posX+c < dst_cols)
            *(dst + c) = lanes[c];
    }
}
// Horizontal pass and store for the 8-bit four-channel box filter.
//
// Each participating work-item adds the 2*anX+1 consecutive entries of `temp`
// starting at its own local id, divides the sum by `alpha` in floating point
// and writes one uchar4 to *dst when the position lies inside the
// destination. anX and ksX are not defined in this function; they presumably
// come from the program build options (TODO confirm).
// NOTE(review): posY is derived from the group id only, so both calls made by
// the kernel are checked against the same row - confirm for odd dst_rows.
inline void update_dst_C4_D0(__global uchar4 *dst, __local uint4* temp,
                             int dst_rows, int dst_cols,
                             int dst_startX, int dst_x_off,
                             float alpha)
{
    const size_t lid = get_local_id(0);

    // The last ksX-1 work-items only supplied halo columns.
    if (lid >= (THREADS-ksX+1))
        return;

    const int posX = dst_startX - dst_x_off + lid;
    const int posY = (get_group_id(1) << 1);

    uint4 total = 0;
    for (int shift = -anX; shift <= anX; ++shift)
        total += temp[lid + anX + shift];

    const bool inside = posX >= 0 && posX < dst_cols && posY >= 0 && posY < dst_rows;
    if (inside)
        *dst = convert_uchar4(convert_float4(total)/alpha);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
// Box filter kernel for 8-bit single-channel data.
//
// Every work-item loads four horizontally adjacent source pixels for ksY+1
// consecutive rows, forms the vertical sums for two output rows and publishes
// them in local memory; update_dst_C1_D0 then performs the horizontal
// summation, the division by `alpha` and the store.
//
// ksX, ksY, anX and anY are not defined in the visible source; they are
// presumably the kernel size and anchor passed as build options (TODO
// confirm). src_offset / dst_offset are split into x and y parts with the
// respective step, and src_whole_rows / src_whole_cols bound the source reads.
__kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global uchar *dst, float alpha,
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step
)
{
int col = get_local_id(0);
const int gX = get_group_id(0);
const int gY = get_group_id(1);
// Split the linear offsets into a column part and a row part.
int src_x_off = src_offset % src_step;
int src_y_off = src_offset / src_step;
int dst_x_off = dst_offset % dst_step;
int dst_y_off = dst_offset / dst_step;
// Shift the processed window left so that it starts at a multiple of 4 in dst.
int head_off = dst_x_off%4;
// Each group advances by (THREADS-ksX+1) four-pixel columns; the source
// window additionally starts anX columns / anY rows earlier.
int startX = ((gX * (THREADS-ksX+1)-anX) * 4) - head_off + src_x_off;
int startY = (gY << 1) - anY + src_y_off;
int dst_startX = (gX * (THREADS-ksX+1) * 4) - head_off + dst_x_off;
int dst_startY = (gY << 1) + dst_y_off;
// data[i] holds the four pixels of row startY+i, widened to uint.
uint4 data[ksY+1];
// temp[r][col] is the vertical sum of this work-item for output row r (0 or 1).
__local uint4 temp[2][THREADS];
#ifdef BORDER_CONSTANT
// Constant border: pixels outside the source contribute 0.
for(int i=0; i < ksY+1; i++)
{
// Fast path: all four pixels are inside the source.
if(startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4+3<src_whole_cols)
{
data[i].x = *(src+(startY+i)*src_step + startX + col * 4);
data[i].y = *(src+(startY+i)*src_step + startX + col * 4 + 1);
data[i].z = *(src+(startY+i)*src_step + startX + col * 4 + 2);
data[i].w = *(src+(startY+i)*src_step + startX + col * 4 + 3);
}
else
{
// Slow path: start from zero and load only the pixels that are inside.
data[i]=0;
int con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4<src_whole_cols;
if(con)data[i].s0 = *(src+(startY+i)*src_step + startX + col*4);
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+1 >=0 && startX+col*4+1<src_whole_cols;
if(con)data[i].s1 = *(src+(startY+i)*src_step + startX + col*4+1) ;
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+2 >=0 && startX+col*4+2<src_whole_cols;
if(con)data[i].s2 = *(src+(startY+i)*src_step + startX + col*4+2);
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+3 >=0 && startX+col*4+3<src_whole_cols;
if(con)data[i].s3 = *(src+(startY+i)*src_step + startX + col*4+3);
}
}
#else
// Other border modes: out-of-range coordinates are remapped with ADDR_*.
int not_all_in_range;
for(int i=0; i < ksY+1; i++)
{
not_all_in_range = (startX+col*4<0) | (startX+col*4+3>src_whole_cols-1)
| (startY+i<0) | (startY+i>src_whole_rows-1);
if(not_all_in_range)
{
int selected_row;
int4 selected_col;
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
selected_col.x = ADDR_L(startX+col*4, 0, src_whole_cols);
selected_col.x = ADDR_R(startX+col*4, src_whole_cols, selected_col.x);
selected_col.y = ADDR_L(startX+col*4+1, 0, src_whole_cols);
selected_col.y = ADDR_R(startX+col*4+1, src_whole_cols, selected_col.y);
selected_col.z = ADDR_L(startX+col*4+2, 0, src_whole_cols);
selected_col.z = ADDR_R(startX+col*4+2, src_whole_cols, selected_col.z);
selected_col.w = ADDR_L(startX+col*4+3, 0, src_whole_cols);
selected_col.w = ADDR_R(startX+col*4+3, src_whole_cols, selected_col.w);
data[i].x = *(src + selected_row * src_step + selected_col.x);
data[i].y = *(src + selected_row * src_step + selected_col.y);
data[i].z = *(src + selected_row * src_step + selected_col.z);
data[i].w = *(src + selected_row * src_step + selected_col.w);
}
else
{
// Everything is inside: one vector load of four pixels.
data[i] = convert_uint4(vload4(col,(__global uchar*)(src+(startY+i)*src_step + startX)));
}
}
#endif
// Rows 1..ksY-1 are shared by both output rows; row 0 completes the first
// output row and row ksY the second one.
uint4 tmp_sum = 0;
for(int i=1; i < ksY; i++)
{
tmp_sum += (data[i]);
}
int index = dst_startY * dst_step + dst_startX + (col-anX)*4;
temp[0][col] = tmp_sum + (data[0]);
temp[1][col] = tmp_sum + (data[ksY]);
// All vertical sums must be visible before neighbours read them.
barrier(CLK_LOCAL_MEM_FENCE);
update_dst_C1_D0(dst+index, (__local uint *)(temp[0]),
dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
update_dst_C1_D0(dst+index+dst_step, (__local uint *)(temp[1]),
dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////8uC4////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
// Box filter kernel for 8-bit four-channel data.
//
// Every work-item loads one uchar4 pixel for ksY+1 consecutive rows, forms the
// vertical sums for two output rows and publishes them in local memory;
// update_dst_C4_D0 then performs the horizontal summation, the division by
// `alpha` and the store.
//
// ksX, ksY, anX and anY are not defined in the visible source; they are
// presumably the kernel size and anchor passed as build options (TODO
// confirm). Steps and the column part of the offsets are shifted right by 2
// before they are used to index the uchar4 pointers.
__kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uchar4 *dst, float alpha,
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step
)
{
int col = get_local_id(0);
const int gX = get_group_id(0);
const int gY = get_group_id(1);
// Split the linear offsets into a column part (in pixels) and a row part.
int src_x_off = (src_offset % src_step) >> 2;
int src_y_off = src_offset / src_step;
int dst_x_off = (dst_offset % dst_step) >> 2;
int dst_y_off = dst_offset / dst_step;
// Each group advances by THREADS-ksX+1 columns; the source window starts
// anX columns / anY rows earlier.
int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
int startY = (gY << 1) - anY + src_y_off;
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
int dst_startY = (gY << 1) + dst_y_off;
// data[i] holds the pixel of row startY+i, widened to uint per channel.
uint4 data[ksY+1];
// temp[r][col] is the vertical sum of this work-item for output row r (0 or 1).
__local uint4 temp[2][THREADS];
#ifdef BORDER_CONSTANT
// Constant border: pixels outside the source contribute 0. The source is
// only dereferenced when `con` is true.
bool con;
for(int i=0; i < ksY+1; i++)
{
con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
int cur_col = clamp(startX + col, 0, src_whole_cols);
data[i].x = con ? src[(startY+i)*(src_step>>2) + cur_col].x : 0;
data[i].y = con ? src[(startY+i)*(src_step>>2) + cur_col].y : 0;
data[i].z = con ? src[(startY+i)*(src_step>>2) + cur_col].z : 0;
data[i].w = con ? src[(startY+i)*(src_step>>2) + cur_col].w : 0;
}
#else
// Other border modes: out-of-range coordinates are remapped with ADDR_*.
for(int i=0; i < ksY+1; i++)
{
int selected_row;
int selected_col;
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
data[i] = convert_uint4(src[selected_row * (src_step>>2) + selected_col]);
}
#endif
// Rows 1..ksY-1 are shared by both output rows; row 0 completes the first
// output row and row ksY the second one.
uint4 tmp_sum = 0;
for(int i=1; i < ksY; i++)
{
tmp_sum += (data[i]);
}
int index = dst_startY * (dst_step>>2)+ dst_startX + col;
temp[0][col] = tmp_sum + (data[0]);
temp[1][col] = tmp_sum + (data[ksY]);
// All vertical sums must be visible before neighbours read them.
barrier(CLK_LOCAL_MEM_FENCE);
update_dst_C4_D0(dst+index, (__local uint4 *)(temp[0]),
dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
update_dst_C4_D0(dst+index+(dst_step>>2), (__local uint4 *)(temp[1]),
dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////32fC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
// Box filter kernel for 32-bit float single-channel data.
//
// Every work-item loads one value for ksY+1 consecutive rows, forms the
// vertical sums for two output rows and publishes them in local memory. After
// the barrier, the first THREADS-(ksX-1) work-items add 2*anX+1 neighbouring
// column sums per output row, divide by `alpha` and store the result if the
// position lies inside the destination.
//
// ksX, ksY, anX and anY are not defined in the visible source; they are
// presumably the kernel size and anchor passed as build options (TODO
// confirm). Steps and the column part of the offsets are shifted right by 2
// before they are used to index the float pointers.
__kernel void boxFilter_C1_D5(__global const float *restrict src, __global float *dst, float alpha,
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step
)
{
int col = get_local_id(0);
const int gX = get_group_id(0);
const int gY = get_group_id(1);
// Split the linear offsets into a column part (in elements) and a row part.
int src_x_off = (src_offset % src_step) >> 2;
int src_y_off = src_offset / src_step;
int dst_x_off = (dst_offset % dst_step) >> 2;
int dst_y_off = dst_offset / dst_step;
// Each group advances by THREADS-ksX+1 columns; the source window starts
// anX columns / anY rows earlier.
int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
int startY = (gY << 1) - anY + src_y_off;
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
int dst_startY = (gY << 1) + dst_y_off;
// data[i] holds the value of row startY+i in this work-item's column.
float data[ksY+1];
// temp[r][col] is the vertical sum of this work-item for output row r (0 or 1).
__local float temp[2][THREADS];
#ifdef BORDER_CONSTANT
// Constant border: values outside the source contribute 0. `ss` is only
// loaded from src when the (clamped) position is inside the source.
bool con;
float ss;
for(int i=0; i < ksY+1; i++)
{
con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
int cur_col = clamp(startX + col, 0, src_whole_cols);
ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>2) + cur_col]:(float)0;
data[i] = con ? ss : 0.f;
}
#else
// Other border modes: out-of-range coordinates are remapped with ADDR_*.
for(int i=0; i < ksY+1; i++)
{
int selected_row;
int selected_col;
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
data[i] = src[selected_row * (src_step>>2) + selected_col];
}
#endif
// sum0: rows 1..ksY-1 (shared); sum1 adds row 0 for the first output row,
// sum2 adds row ksY for the second output row.
float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
for(int i=1; i < ksY; i++)
{
sum0 += (data[i]);
}
sum1 = sum0 + (data[0]);
sum2 = sum0 + (data[ksY]);
temp[0][col] = sum1;
temp[1][col] = sum2;
// All vertical sums must be visible before neighbours read them.
barrier(CLK_LOCAL_MEM_FENCE);
// The last ksX-1 work-items only supplied halo columns and write nothing.
if(col < (THREADS-(ksX-1)))
{
// Move to the centre of this work-item's horizontal window.
col += anX;
int posX = dst_startX - dst_x_off + col - anX;
int posY = (gY << 1);
float tmp_sum[2]= {0.0, 0.0};
for(int k=0; k<2; k++)
for(int i=-anX; i<=anX; i++)
{
tmp_sum[k] += temp[k][col+i];
}
// Store both output rows, each guarded by its own row check.
for(int i=0; i<2; i++)
{
if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows)
dst[(dst_startY+i) * (dst_step>>2)+ dst_startX + col - anX] = tmp_sum[i]/alpha;
}
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////32fC4////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
// Box filter kernel for 32-bit float four-channel data.
//
// Every work-item loads one float4 pixel for ksY+1 consecutive rows, forms the
// vertical sums for two output rows and publishes them in local memory. After
// the barrier, the first THREADS-(ksX-1) work-items add 2*anX+1 neighbouring
// column sums per output row, divide by `alpha` and store the result if the
// position lies inside the destination.
//
// ksX, ksY, anX and anY are not defined in the visible source; they are
// presumably the kernel size and anchor passed as build options (TODO
// confirm). Steps and the column part of the offsets are shifted right by 4
// before they are used to index the float4 pointers.
__kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global float4 *dst, float alpha,
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step
)
{
int col = get_local_id(0);
const int gX = get_group_id(0);
const int gY = get_group_id(1);
// Split the linear offsets into a column part (in pixels) and a row part.
int src_x_off = (src_offset % src_step) >> 4;
int src_y_off = src_offset / src_step;
int dst_x_off = (dst_offset % dst_step) >> 4;
int dst_y_off = dst_offset / dst_step;
// Each group advances by THREADS-ksX+1 columns; the source window starts
// anX columns / anY rows earlier.
int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
int startY = (gY << 1) - anY + src_y_off;
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
int dst_startY = (gY << 1) + dst_y_off;
// data[i] holds the pixel of row startY+i in this work-item's column.
float4 data[ksY+1];
// temp[r][col] is the vertical sum of this work-item for output row r (0 or 1).
__local float4 temp[2][THREADS];
#ifdef BORDER_CONSTANT
// Constant border: pixels outside the source contribute 0. `ss` is only
// loaded from src when the (clamped) position is inside the source.
bool con;
float4 ss;
for(int i=0; i < ksY+1; i++)
{
con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
int cur_col = clamp(startX + col, 0, src_whole_cols);
ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>4) + cur_col]:(float4)0;
data[i] = con ? ss : (float4)(0.0,0.0,0.0,0.0);
}
#else
// Other border modes: out-of-range coordinates are remapped with ADDR_*.
for(int i=0; i < ksY+1; i++)
{
int selected_row;
int selected_col;
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
data[i] = src[selected_row * (src_step>>4) + selected_col];
}
#endif
// sum0: rows 1..ksY-1 (shared); sum1 adds row 0 for the first output row,
// sum2 adds row ksY for the second output row.
float4 sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
for(int i=1; i < ksY; i++)
{
sum0 += (data[i]);
}
sum1 = sum0 + (data[0]);
sum2 = sum0 + (data[ksY]);
temp[0][col] = sum1;
temp[1][col] = sum2;
// All vertical sums must be visible before neighbours read them.
barrier(CLK_LOCAL_MEM_FENCE);
// The last ksX-1 work-items only supplied halo columns and write nothing.
if(col < (THREADS-(ksX-1)))
{
// Move to the centre of this work-item's horizontal window.
col += anX;
int posX = dst_startX - dst_x_off + col - anX;
int posY = (gY << 1);
float4 tmp_sum[2]= {(float4)(0.0,0.0,0.0,0.0), (float4)(0.0,0.0,0.0,0.0)};
for(int k=0; k<2; k++)
for(int i=-anX; i<=anX; i++)
{
tmp_sum[k] += temp[k][col+i];
}
// Store both output rows, each guarded by its own row check.
for(int i=0; i<2; i++)
{
if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows)
dst[(dst_startY+i) * (dst_step>>4)+ dst_startX + col - anX] = tmp_sum[i]/alpha;
}
}
}

@ -0,0 +1,636 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
// Gradient magnitude from the two integer derivatives.
// With L2GRAD defined: sqrt(x*x + y*y); otherwise: |x| + |y|.
inline float calc(int x, int y)
{
#ifdef L2GRAD
    const int squared = x * x + y * y;
    return sqrt((float)squared);
#else
    const float abs_x = (float)abs(x);
    return abs_x + abs(y);
#endif
}
// Row pass of the separable 3x3 Sobel operator.
//
// Smoothing perpendicular to the derivative direction uses a triangle filter:
//   h (-1) =  1, h (0) = 2, h (1) = 1
//   h'(-1) = -1, h'(0) = 0, h'(1) = 1
// so the 2D operator for the x direction is h'(x, y) = h'(x)h(y).
//
// This kernel applies the horizontal halves only: dx_buf receives the
// horizontal difference (-left + right) and dy_buf the horizontal smoothing
// (left + 2*centre + right) of every pixel.
//
// src     input 8bit single channel image data
// dx_buf  output dx buffer
// dy_buf  output dy buffer
//
// src_step / src_offset are applied to the uchar pointer as given; the
// dx_buf / dy_buf steps and offsets arrive in bytes and are converted to int
// elements first. Source coordinates outside the image are clamped to the
// nearest valid row / column for every load, including the centre column of
// work-items with gidx >= cols (the global size may exceed the image size
// because of the required 16x16 work-group size).
__kernel
void
__attribute__((reqd_work_group_size(16,16,1)))
calcSobelRowPass
(
    __global const uchar * src,
    __global int * dx_buf,
    __global int * dy_buf,
    int rows,
    int cols,
    int src_step,
    int src_offset,
    int dx_buf_step,
    int dx_buf_offset,
    int dy_buf_step,
    int dy_buf_offset
)
{
    // Byte units -> int element units.
    dx_buf_step /= sizeof(*dx_buf);
    dx_buf_offset /= sizeof(*dx_buf);
    dy_buf_step /= sizeof(*dy_buf);
    dy_buf_offset /= sizeof(*dy_buf);

    int gidx = get_global_id(0);
    int gidy = get_global_id(1);
    int lidx = get_local_id(0);
    int lidy = get_local_id(1);

    // 16 pixels per row plus one halo column on each side.
    __local int smem[16][18];

    // Start of the (clamped) source row handled by this work-item.
    const int row_base = min(gidy, rows - 1) * src_step + src_offset;

    // Centre column; clamped so that work-items past the right image edge
    // neither read out of bounds nor feed garbage to the last valid column.
    smem[lidy][lidx + 1] =
        src[min(gidx, cols - 1) + row_base];

    // The first work-item of each row also fetches both halo columns.
    if(lidx == 0)
    {
        smem[lidy][0] =
            src[max(gidx - 1, 0) + row_base];
        smem[lidy][17] =
            src[min(gidx + 16, cols - 1) + row_base];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if(gidy < rows && gidx < cols)
    {
        dx_buf[gidx + gidy * dx_buf_step + dx_buf_offset] =
            -smem[lidy][lidx] + smem[lidy][lidx + 2];
        dy_buf[gidx + gidy * dy_buf_step + dy_buf_offset] =
            smem[lidy][lidx] + 2 * smem[lidy][lidx + 1] + smem[lidy][lidx + 2];
    }
}
// Column (vertical) pass of the separable 3x3 Sobel operator plus gradient magnitude.
// This is the buffered version (3x3 Sobel).
//
// dx_buf  dx buffer, calculated by calcSobelRowPass
// dy_buf  dy buffer, calculated by calcSobelRowPass
// dx      output derivative in x direction
// dy      output derivative in y direction
// mag     output gradient magnitude; the value for pixel (x, y) is stored
//         at (x + 1, y + 1)
//
// All steps and offsets are divided by the element size below, i.e. they are
// expected in bytes.
__kernel
void
__attribute__((reqd_work_group_size(16,16,1)))
calcMagnitude_buf
(
    __global const int * dx_buf,
    __global const int * dy_buf,
    __global int * dx,
    __global int * dy,
    __global float * mag,
    int rows,
    int cols,
    int dx_buf_step,
    int dx_buf_offset,
    int dy_buf_step,
    int dy_buf_offset,
    int dx_step,
    int dx_offset,
    int dy_step,
    int dy_offset,
    int mag_step,
    int mag_offset
)
{
    // Convert steps and offsets to element units.
    dx_buf_step   /= sizeof(*dx_buf);
    dx_buf_offset /= sizeof(*dx_buf);
    dy_buf_step   /= sizeof(*dy_buf);
    dy_buf_offset /= sizeof(*dy_buf);
    dx_step       /= sizeof(*dx);
    dx_offset     /= sizeof(*dx);
    dy_step       /= sizeof(*dy);
    dy_offset     /= sizeof(*dy);
    mag_step      /= sizeof(*mag);
    mag_offset    /= sizeof(*mag);

    const int col  = get_global_id(0);
    const int row  = get_global_id(1);
    const int lcol = get_local_id(0);
    const int lrow = get_local_id(1);

    // 16 rows of the tile plus one halo row above and one below.
    __local int sdx[18][16];
    __local int sdy[18][16];

    const int centre_row = min(row, rows - 1);
    sdx[lrow + 1][lcol] = dx_buf[col + centre_row * dx_buf_step + dx_buf_offset];
    sdy[lrow + 1][lcol] = dy_buf[col + centre_row * dy_buf_step + dy_buf_offset];

    if (lrow == 0)
    {
        // The first local row also fetches both halo rows, clamped to the image.
        const int above_row = min(max(row - 1, 0), rows - 1);
        const int below_row = min(row + 16, rows - 1);

        sdx[0][lcol]  = dx_buf[col + above_row * dx_buf_step + dx_buf_offset];
        sdx[17][lcol] = dx_buf[col + below_row * dx_buf_step + dx_buf_offset];
        sdy[0][lcol]  = dy_buf[col + above_row * dy_buf_step + dy_buf_offset];
        sdy[17][lcol] = dy_buf[col + below_row * dy_buf_step + dy_buf_offset];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (col >= cols || row >= rows)
        return;

    // Vertical smoothing of the horizontal difference gives dx;
    // vertical difference of the horizontal smoothing gives dy.
    const int grad_x = sdx[lrow][lcol] + 2 * sdx[lrow + 1][lcol] + sdx[lrow + 2][lcol];
    const int grad_y = sdy[lrow + 2][lcol] - sdy[lrow][lcol];

    dx[col + row * dx_step + dx_offset] = grad_x;
    dy[col + row * dy_step + dy_offset] = grad_y;
    mag[(col + 1) + (row + 1) * mag_step + mag_offset] = calc(grad_x, grad_y);
}
// Calculate the gradient magnitude from already computed x and y derivatives.
// This is the non-buffered version (non-3x3 Sobel).
//
// dx   input derivative in x direction
// dy   input derivative in y direction
// mag  output gradient magnitude; the value for pixel (x, y) is stored
//      at (x + 1, y + 1)
//
// All steps and offsets are divided by the element size below, i.e. they are
// expected in bytes.
__kernel
void calcMagnitude
(
    __global const int * dx,
    __global const int * dy,
    __global float * mag,
    int rows,
    int cols,
    int dx_step,
    int dx_offset,
    int dy_step,
    int dy_offset,
    int mag_step,
    int mag_offset
)
{
    // Convert steps and offsets to element units.
    dx_step    /= sizeof(*dx);
    dx_offset  /= sizeof(*dx);
    dy_step    /= sizeof(*dy);
    dy_offset  /= sizeof(*dy);
    mag_step   /= sizeof(*mag);
    mag_offset /= sizeof(*mag);

    const int col = get_global_id(0);
    const int row = get_global_id(1);

    if (row >= rows || col >= cols)
        return;

    const int grad_x = dx[col + row * dx_step + dx_offset];
    const int grad_y = dy[col + row * dy_step + dy_offset];

    mag[(col + 1) + (row + 1) * mag_step + mag_offset] = calc(grad_x, grad_y);
}
//////////////////////////////////////////////////////////////////////////////////////////
// 0.4142135623730950488016887242097 is tan(22.5 degrees)
#define CANNY_SHIFT 15
#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
// First pass of edge detection: thresholding and non-maximum suppression.
//
// The edge type written for each pixel is:
//   0 - not an edge: the magnitude is not above low_thresh, or it is not a
//       local maximum along the gradient direction
//   1 - maybe an edge: local maximum with magnitude above low_thresh but not
//       above high_thresh
//   2 - is an edge: local maximum with magnitude above high_thresh
//
// The gradient direction is quantised using fixed-point comparisons of |dy|
// against |dx| * tan(22.5 deg) and |dx| * tan(67.5 deg):
//   - nearly horizontal gradient: the magnitude is compared with the west and
//     east neighbours,
//   - nearly vertical gradient: it is compared with the north and south neighbours,
//   - otherwise (diagonal): it is compared with the two diagonal neighbours
//     selected by the sign of dx * dy.
//
// dx, dy  derivatives in x and y direction
// mag     magnitudes calculated by calcMagnitude / calcMagnitude_buf; the value
//         for pixel (x, y) is read from (x + 1, y + 1)
// map     output containing raw edge types; the value for pixel (x, y) is
//         written to (x + 1, y + 1)
//
// All steps and offsets are divided by the element size below, i.e. they are
// expected in bytes.
__kernel
void
__attribute__((reqd_work_group_size(16,16,1)))
calcMap
(
    __global const int * dx,
    __global const int * dy,
    __global const float * mag,
    __global int * map,
    int rows,
    int cols,
    float low_thresh,
    float high_thresh,
    int dx_step,
    int dx_offset,
    int dy_step,
    int dy_offset,
    int mag_step,
    int mag_offset,
    int map_step,
    int map_offset
)
{
    dx_step    /= sizeof(*dx);
    dx_offset  /= sizeof(*dx);
    dy_step    /= sizeof(*dy);
    dy_offset  /= sizeof(*dy);
    mag_step   /= sizeof(*mag);
    mag_offset /= sizeof(*mag);
    map_step   /= sizeof(*map);
    map_offset /= sizeof(*map);

    // Apply every buffer offset once. The dx/dy offsets used to be computed but
    // never applied, so derivative buffers with a non-zero offset were read from
    // the wrong location.
    dx  += dx_offset;
    dy  += dy_offset;
    mag += mag_offset;
    map += map_offset;

    // 16x16 tile of magnitudes plus a one pixel border on every side.
    __local float smem[18][18];

    int gidx = get_global_id(0);
    int gidy = get_global_id(1);
    int lidx = get_local_id(0);
    int lidy = get_local_id(1);

    // Global ids with the low 4 bits cleared: origin of this 16x16 work-group.
    int grp_idx = get_global_id(0) & 0xFFFFF0;
    int grp_idy = get_global_id(1) & 0xFFFFF0;

    // Re-map the 256 work-items onto 18-wide tile rows: the first load fills
    // tile rows 0..13 (252 work-items), the second one fills rows 14..17.
    int tid = lidx + lidy * 16;
    int lx = tid % 18;
    int ly = tid / 18;

    // NOTE(review): the row index is clamped to rows - 1 although the magnitude of
    // image row r is stored in buffer row r + 1 by the magnitude kernels, so the
    // last image row appears to read the previous row's magnitude - confirm
    // against the host-side buffer sizes before changing.
    if(ly < 14)
    {
        smem[ly][lx] =
            mag[grp_idx + lx + min(grp_idy + ly, rows - 1) * mag_step];
    }
    if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols)
    {
        smem[ly + 14][lx] =
            mag[grp_idx + lx + min(grp_idy + ly + 14, rows - 1) * mag_step];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if(gidy < rows && gidx < cols)
    {
        int x = dx[gidx + gidy * dx_step];
        int y = dy[gidx + gidy * dy_step];

        // s is -1 when dx and dy have opposite signs; it selects which diagonal
        // pair of neighbours is examined below.
        const int s = (x ^ y) < 0 ? -1 : 1;
        const float m = smem[lidy + 1][lidx + 1];

        x = abs(x);
        y = abs(y);

        // 0 - the pixel can not belong to an edge
        // 1 - the pixel might belong to an edge
        // 2 - the pixel does belong to an edge
        int edge_type = 0;
        if(m > low_thresh)
        {
            // Fixed-point |dx| * tan(22.5) and |dx| * tan(67.5),
            // using tan(67.5) = tan(22.5) + 2.
            const int tg22x = x * TG22;
            const int tg67x = tg22x + (x << (1 + CANNY_SHIFT));
            y <<= CANNY_SHIFT;

            if(y < tg22x)
            {
                // nearly horizontal gradient: compare with west / east
                if(m > smem[lidy + 1][lidx] && m >= smem[lidy + 1][lidx + 2])
                {
                    edge_type = 1 + (int)(m > high_thresh);
                }
            }
            else if (y > tg67x)
            {
                // nearly vertical gradient: compare with north / south
                if(m > smem[lidy][lidx + 1]&& m >= smem[lidy + 2][lidx + 1])
                {
                    edge_type = 1 + (int)(m > high_thresh);
                }
            }
            else
            {
                // diagonal gradient: compare with the diagonal pair chosen by s
                if(m > smem[lidy][lidx + 1 - s]&& m > smem[lidy + 2][lidx + 1 + s])
                {
                    edge_type = 1 + (int)(m > high_thresh);
                }
            }
        }
        map[gidx + 1 + (gidy + 1) * map_step] = edge_type;
    }
}
#undef CANNY_SHIFT
#undef TG22
//////////////////////////////////////////////////////////////////////////////////////////
// Do hysteresis for pixels whose edge type is 1, locally inside one 16x16 tile.
//
// If a candidate pixel (edge type 1) has a neighbour pixel (in its 3x3 area) with type 2, it is
// believed to be part of an edge and is marked as an edge (type 2). Each work-item iterates
// 16 times to connect local edges.
// Afterwards every pixel identified as an edge is tested for nearby potential edge points (type 1).
// If there are any, the counter is incremented by 1 and the pixel location is stored. These
// potential candidates are processed further by the next kernel (edgesHysteresisGlobal).
//
// map      raw edge type results calculated by calcMap; updated in place. The value for
//          pixel (x, y) is stored at (x + 1, y + 1).
// st       the potential edge points found in this kernel call, stored as (x + 1, y + 1)
// counter  the number of potential edge points stored in st
//
// map_step / map_offset are divided by the element size below, i.e. they are expected in bytes.
__kernel
void
__attribute__((reqd_work_group_size(16,16,1)))
edgesHysteresisLocal
(
__global int * map,
__global ushort2 * st,
__global unsigned int * counter,
int rows,
int cols,
int map_step,
int map_offset
)
{
map_step /= sizeof(*map);
map_offset /= sizeof(*map);
map += map_offset;
// 16x16 tile of edge types plus a one pixel border on every side.
__local int smem[18][18];
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int lidx = get_local_id(0);
int lidy = get_local_id(1);
// Global ids with the low 4 bits cleared: origin of this 16x16 work-group.
int grp_idx = get_global_id(0) & 0xFFFFF0;
int grp_idy = get_global_id(1) & 0xFFFFF0;
// Re-map the 256 work-items onto 18-wide tile rows: the first load fills
// tile rows 0..13, the second one fills rows 14..17.
int tid = lidx + lidy * 16;
int lx = tid % 18;
int ly = tid / 18;
if(ly < 14)
{
smem[ly][lx] =
map[grp_idx + lx + min(grp_idy + ly, rows - 1) * map_step];
}
if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols)
{
smem[ly + 14][lx] =
map[grp_idx + lx + min(grp_idy + ly + 14, rows - 1) * map_step];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(gidy < rows && gidx < cols)
{
int n;
// Propagate "definite edge" (2) to adjacent candidates (1) inside the tile.
// NOTE(review): the loop reads neighbours' smem entries while other work-items
// may be updating them and contains no barrier, so how far an edge propagates
// per iteration presumably depends on execution order - confirm this is intended.
#pragma unroll
for (int k = 0; k < 16; ++k)
{
n = 0;
if (smem[lidy + 1][lidx + 1] == 1)
{
// count the 8 neighbours that are already definite edges
n += smem[lidy ][lidx ] == 2;
n += smem[lidy ][lidx + 1] == 2;
n += smem[lidy ][lidx + 2] == 2;
n += smem[lidy + 1][lidx ] == 2;
n += smem[lidy + 1][lidx + 2] == 2;
n += smem[lidy + 2][lidx ] == 2;
n += smem[lidy + 2][lidx + 1] == 2;
n += smem[lidy + 2][lidx + 2] == 2;
}
if (n > 0)
smem[lidy + 1][lidx + 1] = 2;
}
// Write the (possibly promoted) edge type back to the global map.
const int e = smem[lidy + 1][lidx + 1];
map[gidx + 1 + (gidy + 1) * map_step] = e;
n = 0;
if(e == 2)
{
// count the 8 neighbours that are still only candidates
n += smem[lidy ][lidx ] == 1;
n += smem[lidy ][lidx + 1] == 1;
n += smem[lidy ][lidx + 2] == 1;
n += smem[lidy + 1][lidx ] == 1;
n += smem[lidy + 1][lidx + 2] == 1;
n += smem[lidy + 2][lidx ] == 1;
n += smem[lidy + 2][lidx + 1] == 1;
n += smem[lidy + 2][lidx + 2] == 1;
}
if(n > 0)
{
// An edge pixel with remaining candidate neighbours becomes a seed for the
// global pass: reserve a slot in st and store its map coordinates.
unsigned int ind = atomic_inc(counter);
st[ind] = (ushort2)(gidx + 1, gidy + 1);
}
}
}
// Offsets of the 8 neighbours of a pixel (row by row, centre excluded).
__constant int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
__constant int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
// Capacity of the per-work-group stack of pixels that still have to be expanded.
#define stack_size 512
// Global hysteresis pass: grows edges starting from the seed pixels collected
// by the previous pass.
//
// Each work-group takes one seed from st1 and promotes its candidate neighbours
// (edge type 1) to definite edges (type 2), pushing every promoted pixel onto a
// local stack. The stack is then expanded repeatedly, each group of 8 work-items
// handling the 8 neighbours of one stacked pixel, until the stack is empty or
// has too little room left for another round. Pixels left on the stack are
// appended to st2 and counted in counter for a further kernel invocation.
//
// map      edge type map, updated in place; pixel (x, y) is stored at (x + 1, y + 1)
// st1      seed positions to process, in map coordinates
// st2      output positions that still need processing
// counter  number of positions appended to st2
// count    number of valid entries in st1
//
// map_step / map_offset are divided by the element size below, i.e. they are
// expected in bytes.
__kernel
void
__attribute__((reqd_work_group_size(128,1,1)))
edgesHysteresisGlobal
(
    __global int * map,
    __global ushort2 * st1,
    __global ushort2 * st2,
    __global int * counter,
    int rows,
    int cols,
    int count,
    int map_step,
    int map_offset
)
{
    map_step   /= sizeof(*map);
    map_offset /= sizeof(*map);
    map += map_offset;

    int lidx = get_local_id(0);
    int grp_idx = get_group_id(0);
    int grp_idy = get_group_id(1);

    __local unsigned int s_counter;
    __local unsigned int s_ind;
    __local ushort2 s_st[stack_size];

    if(lidx == 0)
    {
        s_counter = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Linear index of this work-group in the 2D grid of groups. The row pitch of
    // that grid is the number of groups along dimension 0, not the work-group size.
    int ind = mad24(grp_idy, (int)get_num_groups(0), grp_idx);

    if(ind < count)
    {
        // Every work-item of the group reads the same seed, so the branches
        // containing barriers below are taken uniformly by the whole group.
        ushort2 pos = st1[ind];
        if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
        {
            if (lidx < 8)
            {
                // The first 8 work-items examine the 8 neighbours of the seed.
                pos.x += c_dx[lidx];
                pos.y += c_dy[lidx];
                if (map[pos.x + pos.y * map_step] == 1)
                {
                    map[pos.x + pos.y * map_step] = 2;
                    ind = atomic_inc(&s_counter);
                    s_st[ind] = pos;
                }
            }
            barrier(CLK_LOCAL_MEM_FENCE);

            // Keep expanding while there is work and the stack still has room for
            // the worst case of one push per work-item.
            while (s_counter > 0 && s_counter <= stack_size - get_local_size(0))
            {
                // Each group of 8 consecutive work-items pops one stack entry.
                const int subTaskIdx = lidx >> 3;
                const int portion = min(s_counter, (uint)(get_local_size(0)>> 3));

                pos.x = pos.y = 0;
                if (subTaskIdx < portion)
                    pos = s_st[s_counter - 1 - subTaskIdx];
                barrier(CLK_LOCAL_MEM_FENCE);

                if (lidx == 0)
                    s_counter -= portion;
                barrier(CLK_LOCAL_MEM_FENCE);

                // Work-items without a popped entry keep pos == (0, 0) and are
                // rejected by this range check.
                if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
                {
                    pos.x += c_dx[lidx & 7];
                    pos.y += c_dy[lidx & 7];
                    if (map[pos.x + pos.y * map_step] == 1)
                    {
                        map[pos.x + pos.y * map_step] = 2;
                        ind = atomic_inc(&s_counter);
                        s_st[ind] = pos;
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
            }

            if (s_counter > 0)
            {
                if (lidx == 0)
                {
                    // atomic_add returns the counter value from before the
                    // addition, which is exactly the first st2 slot reserved for
                    // this work-group. (Subtracting s_counter from it, as was done
                    // before, shifted the writes s_counter slots too low.)
                    s_ind = atomic_add(counter, s_counter);
                }
                barrier(CLK_LOCAL_MEM_FENCE);

                ind = s_ind;
                for (int i = lidx; i < s_counter; i += get_local_size(0))
                {
                    st2[ind + i] = s_st[i];
                }
            }
        }
    }
}
#undef stack_size
// Get the edge result. Pixels of edge type 2 are marked as edge points and set
// to 255; all other pixels are set to 0.
//
// map  edge type map; the value for pixel (x, y) is read from (x + 1, y + 1)
// dst  edge output (8bit single channel)
//
// map_step / map_offset are divided by the element size below, i.e. they are
// expected in bytes. dst_step / dst_offset index the uchar output directly.
__kernel
void getEdges
(
    __global const int * map,
    __global uchar * dst,
    int rows,
    int cols,
    int map_step,
    int map_offset,
    int dst_step,
    int dst_offset
)
{
    map_step   /= sizeof(*map);
    map_offset /= sizeof(*map);

    int gidx = get_global_id(0);
    int gidy = get_global_id(1);

    if(gidy < rows && gidx < cols)
    {
        // edge type 2 -> (2 >> 1) = 1 -> -1 -> 0xFF; edge types 0 and 1 -> 0.
        // dst_offset is now honoured; it used to be ignored, so a destination
        // with a non-zero offset was written at the wrong location.
        dst[gidx + gidy * dst_step + dst_offset] =
            (uchar)(-(map[gidx + 1 + (gidy + 1) * map_step + map_offset] >> 1));
    }
}

@ -0,0 +1,255 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Sen Liu, swjtuls1987@126.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef WAVE_SIZE
#define WAVE_SIZE 1
#endif
// Inclusive prefix sum over 256 values held one per work-item.
// Each work-item stores val at smem[tid]; work-item 0 then accumulates the 256
// entries serially, and every work-item returns the running total at its own index.
// Must be called by all work-items of the group because it contains barriers.
int calc_lut(__local int* smem, int val, int tid)
{
    smem[tid] = val;
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid == 0)
    {
        int running = smem[0];
        for (int bin = 1; bin < 256; ++bin)
        {
            running += smem[bin];
            smem[bin] = running;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    return smem[tid];
}
// Sum reduction of one int per work-item over smem[0..255].
// Two variants are provided:
//  - CPU:     a barrier after every halving step; the total is stored in smem[256].
//  - default: barriers are only emitted for the steps selected by WAVE_SIZE; the
//             total is left in smem[0].
// Both variants contain barriers and must be called by all work-items of the group.
#ifdef CPU
void reduce(volatile __local int* smem, int val, int tid)
{
smem[tid] = val;
barrier(CLK_LOCAL_MEM_FENCE);
// halve the number of active entries at each step: 256 -> 128 -> ... -> 2
if (tid < 128)
smem[tid] = val += smem[tid + 128];
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 64)
smem[tid] = val += smem[tid + 64];
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 32)
smem[tid] += smem[tid + 32];
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
smem[tid] += smem[tid + 16];
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
smem[tid] += smem[tid + 8];
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4)
smem[tid] += smem[tid + 4];
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2)
smem[tid] += smem[tid + 2];
barrier(CLK_LOCAL_MEM_FENCE);
// final step: the total goes to smem[256] rather than smem[0]
if (tid < 1)
smem[256] = smem[tid] + smem[tid + 1];
barrier(CLK_LOCAL_MEM_FENCE);
}
#else
void reduce(__local volatile int* smem, int val, int tid)
{
smem[tid] = val;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 128)
smem[tid] = val += smem[tid + 128];
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 64)
smem[tid] = val += smem[tid + 64];
barrier(CLK_LOCAL_MEM_FENCE);
// From here on the preprocessor decides where the "if" blocks are closed and
// re-opened: a barrier is only inserted before a step when WAVE_SIZE is smaller
// than the number of work-items taking part in it.
if (tid < 32)
{
smem[tid] += smem[tid + 32];
#if WAVE_SIZE < 32
} barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
{
#endif
smem[tid] += smem[tid + 16];
#if WAVE_SIZE < 16
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
{
#endif
// NOTE(review): the last four steps never have a barrier between them, so they
// presumably rely on the participating work-items running in lockstep; with the
// default WAVE_SIZE of 1 that is not guaranteed - confirm.
smem[tid] += smem[tid + 8];
smem[tid] += smem[tid + 4];
smem[tid] += smem[tid + 2];
smem[tid] += smem[tid + 1];
}
}
#endif
// Builds the CLAHE look-up table of one tile per work-group.
//
// Work-group (tx, ty) computes the 256-bin histogram of tile (tx, ty) of src,
// optionally clips it at clipLimit and redistributes the clipped samples evenly
// over all bins, then converts the cumulative histogram, scaled by lutScale,
// into 256 LUT entries.
//
// src / srcStep / src_offset  8bit source image, indexed in bytes
// lut / dstStep / dst_offset  output LUTs; the table of tile (tx, ty) starts at
//                             (ty * tilesX + tx) * dstStep + dst_offset
// tileSize                    tile width (x) and height (y) in pixels
// tilesX                      number of tiles per row
// clipLimit                   histogram clip limit; clipping is skipped when <= 0
// lutScale                    factor applied to the cumulative histogram
//
// NOTE(review): the kernel indexes the 256 histogram bins by the flattened local
// id, so it presumably has to be launched with exactly 256 work-items per
// group - confirm against the host code.
__kernel void calcLut(__global __const uchar * src, __global uchar * lut,
                      const int srcStep, const int dstStep,
                      const int2 tileSize, const int tilesX,
                      const int clipLimit, const float lutScale,
                      const int src_offset, const int dst_offset)
{
    __local int smem[512];
    // OpenCL C requires __local variables to be declared at kernel function
    // scope, so this is declared here rather than inside the clipping branch.
    __local int totalClipped;

    const int tx = get_group_id(0);
    const int ty = get_group_id(1);
    const unsigned int tid = get_local_id(1) * get_local_size(0)
                           + get_local_id(0);

    // Clear this work-item's histogram bin.
    smem[tid] = 0;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Accumulate the tile histogram with local atomics.
    for (int i = get_local_id(1); i < tileSize.y; i += get_local_size(1))
    {
        __global const uchar* srcPtr = src + mad24(ty * tileSize.y + i, srcStep, tx * tileSize.x + src_offset);
        for (int j = get_local_id(0); j < tileSize.x; j += get_local_size(0))
        {
            const int data = srcPtr[j];
            atomic_inc(&smem[data]);
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    int tHistVal = smem[tid];
    barrier(CLK_LOCAL_MEM_FENCE);

    // clipLimit is a kernel argument, so this branch (which contains barriers)
    // is taken uniformly by the whole work-group.
    if (clipLimit > 0)
    {
        // clip histogram bar
        int clipped = 0;
        if (tHistVal > clipLimit)
        {
            clipped = tHistVal - clipLimit;
            tHistVal = clipLimit;
        }

        // find number of overall clipped samples
        reduce(smem, clipped, tid);
        barrier(CLK_LOCAL_MEM_FENCE);
#ifdef CPU
        clipped = smem[256];
#else
        clipped = smem[0];
#endif

        // broadcast evaluated value
        if (tid == 0)
            totalClipped = clipped;
        barrier(CLK_LOCAL_MEM_FENCE);

        // redistribute clipped samples evenly
        int redistBatch = totalClipped / 256;
        tHistVal += redistBatch;

        // the remainder goes to the first bins, one sample each
        int residual = totalClipped - redistBatch * 256;
        if (tid < residual)
            ++tHistVal;
    }

    // Cumulative histogram -> scaled, rounded and clamped LUT entry.
    const int lutVal = calc_lut(smem, tHistVal, tid);
    uint ires = (uint)convert_int_rte(lutScale * lutVal);
    lut[(ty * tilesX + tx) * dstStep + tid + dst_offset] =
        convert_uchar(clamp(ires, (uint)0, (uint)255));
}
// Applies the per-tile CLAHE look-up tables to the image.
//
// For every pixel the four tiles whose centres surround it are located, the
// source value is mapped through each of their LUTs, and the four results are
// blended bilinearly. Tile indices are clamped at the image border.
//
// src / srcStep / src_offset  8bit source image
// dst / dstStep / dst_offset  8bit destination image
// lut / lutStep / lut_offset  LUTs produced by calcLut, one row of lutStep
//                             entries per tile
// cols, rows                  image size in pixels
// tileSize                    tile width (x) and height (y) in pixels
// tilesX, tilesY              number of tiles per row / per column
__kernel void transform(__global __const uchar * src,
                        __global uchar * dst,
                        __global uchar * lut,
                        const int srcStep, const int dstStep, const int lutStep,
                        const int cols, const int rows,
                        const int2 tileSize,
                        const int tilesX, const int tilesY,
                        const int src_offset, const int dst_offset, int lut_offset)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    if (col >= cols || row >= rows)
        return;

    // Vertical position relative to the tile centres, and the blend weight
    // towards the lower tile.
    const float tile_yf = (convert_float(row) / tileSize.y) - 0.5f;
    const int tile_y_floor = convert_int_rtn(tile_yf);
    const float wy = tile_yf - tile_y_floor;
    const int tile_y0 = max(tile_y_floor, 0);
    const int tile_y1 = min(tile_y_floor + 1, tilesY - 1);

    // Horizontal position relative to the tile centres, and the blend weight
    // towards the right tile.
    const float tile_xf = (convert_float(col) / tileSize.x) - 0.5f;
    const int tile_x_floor = convert_int_rtn(tile_xf);
    const float wx = tile_xf - tile_x_floor;
    const int tile_x0 = max(tile_x_floor, 0);
    const int tile_x1 = min(tile_x_floor + 1, tilesX - 1);

    const int srcVal = src[mad24(row, srcStep, col + src_offset)];
    const int lut_col = srcVal + lut_offset;

    // Bilinear blend of the four surrounding tile LUTs.
    float blended = 0;
    blended += lut[mad24(tile_y0 * tilesX + tile_x0, lutStep, lut_col)] * ((1.0f - wx) * (1.0f - wy));
    blended += lut[mad24(tile_y0 * tilesX + tile_x1, lutStep, lut_col)] * ((wx) * (1.0f - wy));
    blended += lut[mad24(tile_y1 * tilesX + tile_x0, lutStep, lut_col)] * ((1.0f - wx) * (wy));
    blended += lut[mad24(tile_y1 * tilesX + tile_x1, lutStep, lut_col)] * ((wx) * (wy));

    const uint rounded = (uint)convert_int_rte(blended);
    dst[mad24(row, dstStep, col + dst_offset)] = convert_uchar(clamp(rounded, (uint)0, (uint)255));
}

@ -0,0 +1,109 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jiang Liyuan, jlyuan001.good@163.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (__ATI__)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (__NVIDIA__)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
/************************************** convolve **************************************/
// 2D convolution of a single channel float image with a small kernel.
//
// Each work-group stages a 32x32 patch of src in local memory: its own tile
// plus an 8 pixel apron on every side, with coordinates clamped to the image
// (border replication). Every work-item loads one pixel of each 16x16 quadrant
// of the patch and then accumulates the kHeight x kWidth products centred on
// its own pixel.
//
// src    input image            (src_step, src_offset used as element indices)
// temp1  convolution kernel     (k_step, koffset used as element indices)
// dst    output image           (dst_step, dst_offset used as element indices)
//
// NOTE(review): the 32x32 patch presumably requires a 16x16 work-group and
// kWidth, kHeight <= 17 - confirm against the host code.
__kernel void convolve_D5(__global float *src, __global float *temp1, __global float *dst,
                          int rows, int cols, int src_step, int dst_step,int k_step, int kWidth, int kHeight,
                          int src_offset, int dst_offset, int koffset)
{
    __local float smem[16 + 2 * 8][16 + 2 * 8];

    const int x = get_local_id(0);
    const int y = get_local_id(1);
    const int gx = get_global_id(0);
    const int gy = get_global_id(1);

    // Clamped source rows / columns shared by the four quadrant loads.
    const int row_upper = min(max(gy - 8, 0), rows - 1) * src_step;
    const int row_lower = min(gy + 8, rows - 1) * src_step;
    const int col_left  = min(max(gx - 8, 0), cols - 1) + src_offset;
    const int col_right = min(gx + 8, cols - 1) + src_offset;

    smem[y][x]           = src[row_upper + col_left];   // upper-left quadrant
    smem[y][x + 16]      = src[row_upper + col_right];  // upper-right quadrant
    smem[y + 16][x]      = src[row_lower + col_left];   // lower-left quadrant
    smem[y + 16][x + 16] = src[row_lower + col_right];  // lower-right quadrant
    barrier(CLK_LOCAL_MEM_FENCE);

    if (gx >= cols || gy >= rows)
        return;

    // Upper-left corner of the kernel window inside the staged patch.
    const int win_y = y + 8 - kHeight / 2;
    const int win_x = x + 8 - kWidth / 2;

    float acc = 0;
    for (int ky = 0; ky < kHeight; ++ky)
    {
        for (int kx = 0; kx < kWidth; ++kx)
        {
            acc += smem[win_y + ky][win_x + kx] * temp1[ky * k_step + kx + koffset];
        }
    }
    dst[gy * dst_step + gx + dst_offset] = acc;
}

@ -0,0 +1,134 @@
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Niko Li, newlife20080214@gmail.com
// Zero Lin zero.lin@amd.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//
#if defined (DOUBLE_SUPPORT)
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#endif
// EXTRAPOLATE(x, y, v): assigns to v the value of the border pixel whose source
// coordinates (x, y) lie outside the source image. Exactly one BORDER_* macro
// must be defined to select the border mode. Every variant except
// BORDER_CONSTANT overwrites x and y with the source coordinates actually read.
// The macro bodies use the kernel's src, src_cols, src_rows, src_step,
// src_offset and scalar names directly.
#ifdef BORDER_CONSTANT
// Constant border: every outside pixel gets the value "scalar".
#define EXTRAPOLATE(x, y, v) v = scalar;
#elif defined BORDER_REPLICATE
// Replicated border: coordinates are clamped to the nearest edge pixel.
#define EXTRAPOLATE(x, y, v) \
{ \
x = max(min(x, src_cols - 1), 0); \
y = max(min(y, src_rows - 1), 0); \
v = src[mad24(y, src_step, x + src_offset)]; \
}
#elif defined BORDER_WRAP
// Wrapped border: coordinates are taken modulo the image size (negative
// coordinates are first shifted up by a whole number of image sizes).
#define EXTRAPOLATE(x, y, v) \
{ \
if (x < 0) \
x -= ((x - src_cols + 1) / src_cols) * src_cols; \
if (x >= src_cols) \
x %= src_cols; \
\
if (y < 0) \
y -= ((y - src_rows + 1) / src_rows) * src_rows; \
if( y >= src_rows ) \
y %= src_rows; \
v = src[mad24(y, src_step, x + src_offset)]; \
}
#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)
// Reflected border. delta selects whether the edge pixel itself is repeated:
// BORDER_REFLECT (delta = 0) maps -1 to 0, BORDER_REFLECT_101 (delta = 1) maps -1 to 1.
#ifdef BORDER_REFLECT
#define DELTA int delta = 0
#else
#define DELTA int delta = 1
#endif
// Coordinates are reflected repeatedly until they fall inside the image. A
// coordinate that is already in range ends up unchanged (it is reflected out
// and back again). Images with a single column / row always map to 0.
#define EXTRAPOLATE(x, y, v) \
{ \
DELTA; \
if (src_cols == 1) \
x = 0; \
else \
do \
{ \
if( x < 0 ) \
x = -x - 1 + delta; \
else \
x = src_cols - 1 - (x - src_cols) - delta; \
} \
while (x >= src_cols || x < 0); \
\
if (src_rows == 1) \
y = 0; \
else \
do \
{ \
if( y < 0 ) \
y = -y - 1 + delta; \
else \
y = src_rows - 1 - (y - src_rows) - delta; \
} \
while (y >= src_rows || y < 0); \
v = src[mad24(y, src_step, x + src_offset)]; \
}
#else
#error No extrapolation method
#endif
// True when the source coordinates (gx, gy) fall outside the source image.
#define NEED_EXTRAPOLATION(gx, gy) (gx >= src_cols || gy >= src_rows || gx < 0 || gy < 0)
// Copies src into dst with a border around it.
//
// Destination pixel (x, y) corresponds to source pixel (x - left, y - top).
// Pixels that map inside the source are copied; all others are produced by the
// EXTRAPOLATE macro selected at build time (BORDER_* mode).
//
// src / src_step / src_offset  source image      (step and offset used as element indices)
// dst / dst_step / dst_offset  destination image (step and offset used as element indices)
// top, left                    border thickness above / to the left of the source
// scalar                       border value used by the BORDER_CONSTANT mode
__kernel void copymakeborder
    (__global const GENTYPE *src,
     __global GENTYPE *dst,
     int dst_cols, int dst_rows,
     int src_cols, int src_rows,
     int src_step, int src_offset,
     int dst_step, int dst_offset,
     int top, int left, GENTYPE scalar)
{
    const int out_x = get_global_id(0);
    const int out_y = get_global_id(1);

    if (out_x >= dst_cols || out_y >= dst_rows)
        return;

    // Source coordinates of this destination pixel; EXTRAPOLATE may rewrite them.
    int in_x = out_x - left;
    int in_y = out_y - top;
    const int out_index = mad24(out_y, dst_step, out_x + dst_offset);

    if (NEED_EXTRAPOLATION(in_x, in_y))
        EXTRAPOLATE(in_x, in_y, dst[out_index])
    else
        dst[out_index] = src[mad24(in_y, src_step, in_x + src_offset)];
}

@ -0,0 +1,306 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
// Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/**************************************PUBLICFUNC*************************************/
// Build-time configuration shared by the colour conversion kernels below.
// The kernels use the macros depth, scn, dcn and bidx, which are not defined in
// this file - presumably they are passed as compiler options by the host code.
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
// Select the element type and its value range from the depth code:
//   DATA_TYPE  element type of one channel
//   MAX_NUM    maximum channel value (used for the alpha channel)
//   HALF_MAX   mid-range value (used as the chroma offset)
//   SAT_CAST   saturating conversion to DATA_TYPE (a no-op for float)
#if depth == 0
#define DATA_TYPE uchar
#define MAX_NUM 255
#define HALF_MAX 128
#define SAT_CAST(num) convert_uchar_sat(num)
#define DEPTH_0
#elif depth == 2
#define DATA_TYPE ushort
#define MAX_NUM 65535
#define HALF_MAX 32768
#define SAT_CAST(num) convert_ushort_sat(num)
#define DEPTH_2
#elif depth == 5
#define DATA_TYPE float
#define MAX_NUM 1.0f
#define HALF_MAX 0.5f
#define SAT_CAST(num) (num)
#define DEPTH_5
#else
#error "invalid depth: should be 0 (CV_8U), 2 (CV_16U) or 5 (CV_32F)"
#endif
// Rounding right shift: divides x by 2^n, rounding to nearest.
#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))
// Fixed-point constants. R2Y, G2Y and B2Y are the luma weights 0.299, 0.587 and
// 0.114 scaled by 2^yuv_shift.
enum
{
yuv_shift = 14,
xyz_shift = 12,
R2Y = 4899,
G2Y = 9617,
B2Y = 1868,
BLOCK_SIZE = 256
};
// Size in bytes of one source / destination pixel (all channels).
#define scnbytes ((int)sizeof(DATA_TYPE)*scn)
#define dcnbytes ((int)sizeof(DATA_TYPE)*dcn)
///////////////////////////////////// RGB <-> GRAY //////////////////////////////////////
// Converts a colour image to grayscale: gray = 0.299 * R + 0.587 * G + 0.114 * B.
// Integer depths use the fixed-point weights R2Y / G2Y / B2Y with rounding.
//
// srcptr / srcstep / srcoffset  source image, addressed in bytes
// dstptr / dststep / dstoffset  destination image, addressed in bytes
// rows, cols                    image size in pixels
// bidx (build-time macro) is the index of the blue channel; red is at bidx ^ 2.
__kernel void RGB2Gray(__global const uchar* srcptr, int srcstep, int srcoffset,
                       __global uchar* dstptr, int dststep, int dstoffset,
                       int rows, int cols)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if (y < rows && x < cols)
    {
        // The typed views must stay in the __global address space: OpenCL C 1.x
        // does not allow casting a __global pointer to an unqualified (private) one.
        __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
        __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
#if defined (DEPTH_5)
        dst[0] = src[bidx] * 0.114f + src[1] * 0.587f + src[(bidx^2)] * 0.299f;
#else
        dst[0] = (DATA_TYPE)CV_DESCALE((src[bidx] * B2Y + src[1] * G2Y + src[(bidx^2)] * R2Y), yuv_shift);
#endif
    }
}
// Converts a grayscale image to colour by copying the gray value into the first
// three channels. When the destination has 4 channels (dcn == 4) the fourth
// channel is set to MAX_NUM.
//
// srcptr / srcstep / srcoffset  source image, addressed in bytes
// dstptr / dststep / dstoffset  destination image, addressed in bytes
// rows, cols                    image size in pixels
__kernel void Gray2RGB(__global const uchar* srcptr, int srcstep, int srcoffset,
                       __global uchar* dstptr, int dststep, int dstoffset,
                       int rows, int cols)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if (y < rows && x < cols)
    {
        // The typed views must stay in the __global address space: OpenCL C 1.x
        // does not allow casting a __global pointer to an unqualified (private) one.
        __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
        __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));

        DATA_TYPE val = src[0];
        dst[0] = dst[1] = dst[2] = val;
#if dcn == 4
        dst[3] = MAX_NUM;
#endif
    }
}
///////////////////////////////////// RGB <-> YUV //////////////////////////////////////
// Coefficient tables, ordered { B->Y, G->Y, R->Y, (B-Y)->U, (R-Y)->V }.
// The integer table holds the same weights in fixed point (scaled by 2^yuv_shift).
__constant float c_RGB2YUVCoeffs_f[5] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f };
__constant int c_RGB2YUVCoeffs_i[5] = { B2Y, G2Y, R2Y, 8061, 14369 };

// Converts one RGB/BGR pixel per work-item to Y, U, V (stored in that order).
//
//   srcptr/dstptr       - base pointers of the source / destination buffers
//   srcstep/dststep     - row pitch in bytes
//   srcoffset/dstoffset - byte offset of the first pixel inside the buffer
//   rows, cols          - image size; work-items outside it do nothing
//
// `bidx` is the index of the blue channel, `bidx^2` that of the red channel.
__kernel void RGB2YUV(__global const uchar* srcptr, int srcstep, int srcoffset,
                      __global uchar* dstptr, int dststep, int dstoffset,
                      int rows, int cols)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if (y < rows && x < cols)
    {
        // Typed views must remain __global; dropping the address space in the
        // cast is not valid OpenCL 1.x.
        __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
        __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));

        const DATA_TYPE b = src[bidx], g = src[1], r = src[bidx^2];
#if defined (DEPTH_5)
        __constant float * coeffs = c_RGB2YUVCoeffs_f;
        const DATA_TYPE Y = b * coeffs[0] + g * coeffs[1] + r * coeffs[2];
        const DATA_TYPE U = (b - Y) * coeffs[3] + HALF_MAX;
        const DATA_TYPE V = (r - Y) * coeffs[4] + HALF_MAX;
#else
        __constant int * coeffs = c_RGB2YUVCoeffs_i;
        // HALF_MAX expressed in the same fixed-point scale as the products.
        const int delta = HALF_MAX * (1 << yuv_shift);
        const int Y = CV_DESCALE(b * coeffs[0] + g * coeffs[1] + r * coeffs[2], yuv_shift);
        const int U = CV_DESCALE((b - Y) * coeffs[3] + delta, yuv_shift);
        const int V = CV_DESCALE((r - Y) * coeffs[4] + delta, yuv_shift);
#endif
        dst[0] = SAT_CAST( Y );
        dst[1] = SAT_CAST( U );
        dst[2] = SAT_CAST( V );
    }
}
// Coefficient tables, ordered { U->B, U->G, V->G, V->R }. Only four entries are
// used, so the arrays are sized to match their initialisers.
__constant float c_YUV2RGBCoeffs_f[4] = { 2.032f, -0.395f, -0.581f, 1.140f };
__constant int c_YUV2RGBCoeffs_i[4] = { 33292, -6472, -9519, 18678 };

// Converts one Y, U, V pixel per work-item to RGB/BGR; when dcn == 4 the fourth
// channel is set to MAX_NUM.
//
//   srcptr/dstptr       - base pointers of the source / destination buffers
//   srcstep/dststep     - row pitch in bytes
//   srcoffset/dstoffset - byte offset of the first pixel inside the buffer
//   rows, cols          - image size; work-items outside it do nothing
//
// `bidx` is the destination index of the blue channel, `bidx^2` that of red.
__kernel void YUV2RGB(__global const uchar* srcptr, int srcstep, int srcoffset,
                      __global uchar* dstptr, int dststep, int dstoffset,
                      int rows, int cols)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if (y < rows && x < cols)
    {
        // Typed views must remain __global; dropping the address space in the
        // cast is not valid OpenCL 1.x.
        __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
        __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));

        const DATA_TYPE Y = src[0], U = src[1], V = src[2];
#if defined (DEPTH_5)
        __constant float * coeffs = c_YUV2RGBCoeffs_f;
        const float r = Y + (V - HALF_MAX) * coeffs[3];
        const float g = Y + (V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1];
        const float b = Y + (U - HALF_MAX) * coeffs[0];
#else
        __constant int * coeffs = c_YUV2RGBCoeffs_i;
        // Fixed-point products are rounded back to integers by CV_DESCALE.
        const int r = Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift);
        const int g = Y + CV_DESCALE((V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1], yuv_shift);
        const int b = Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift);
#endif
        dst[bidx] = SAT_CAST( b );
        dst[1] = SAT_CAST( g );
        dst[bidx^2] = SAT_CAST( r );
#if dcn == 4
        dst[3] = MAX_NUM;
#endif
    }
}
// Fixed-point conversion constants (scaled by 2^ITUR_BT_601_SHIFT).
__constant int ITUR_BT_601_CY = 1220542;
__constant int ITUR_BT_601_CUB = 2116026;
__constant int ITUR_BT_601_CUG = 409993;
__constant int ITUR_BT_601_CVG = 852492;
__constant int ITUR_BT_601_CVR = 1673527;
__constant int ITUR_BT_601_SHIFT = 20;

// Writes one 8-bit output pixel at `px` from a luma sample and the three
// pre-computed chroma terms (each already includes the rounding constant).
inline void nv12_store_pixel(__global uchar* px, int luma, int ruv, int guv, int buv)
{
    const int scaledY = max(0, luma - 16) * ITUR_BT_601_CY;
    px[2 - bidx] = convert_uchar_sat((scaledY + ruv) >> ITUR_BT_601_SHIFT);
    px[1] = convert_uchar_sat((scaledY + guv) >> ITUR_BT_601_SHIFT);
    px[bidx] = convert_uchar_sat((scaledY + buv) >> ITUR_BT_601_SHIFT);
#if dcn == 4
    px[3] = 255;
#endif
}

// Each work-item converts a 2x2 block of pixels: four luma samples are read
// from rows 2*y and 2*y+1 and share one chroma pair read from row `rows + y`.
//
//   srcptr, srcstep, srcoffset - source buffer, row pitch in bytes, byte offset
//   dstptr, dststep, dstoffset - destination buffer, row pitch in bytes, byte offset
//   rows, cols                 - size of the output image
__kernel void YUV2RGBA_NV12(__global const uchar* srcptr, int srcstep, int srcoffset,
                            __global uchar* dstptr, int dststep, int dstoffset,
                            int rows, int cols)
{
    const int x = get_global_id(0); // ranges over cols / 2
    const int y = get_global_id(1); // ranges over rows / 2

    if (y >= rows / 2 || x >= cols / 2)
        return;

    __global const uchar* lumaRow = srcptr + mad24(y << 1, srcstep, (x << 1) + srcoffset);
    __global const uchar* chroma = srcptr + mad24(rows + y, srcstep, (x << 1) + srcoffset);
    __global uchar* outTop = dstptr + mad24(y << 1, dststep, x*(dcn*2) + dstoffset);
    __global uchar* outBottom = dstptr + mad24((y << 1) + 1, dststep, x*(dcn*2) + dstoffset);

    // Read every input sample before any output is written.
    const int lumaTL = lumaRow[0];
    const int lumaTR = lumaRow[1];
    const int lumaBL = lumaRow[srcstep];
    const int lumaBR = lumaRow[srcstep + 1];
    const int U = chroma[0] - 128;
    const int V = chroma[1] - 128;

    // Chroma contributions shared by the four pixels, rounding term included.
    const int half_unit = 1 << (ITUR_BT_601_SHIFT - 1);
    const int ruv = half_unit + ITUR_BT_601_CVR * V;
    const int guv = half_unit - ITUR_BT_601_CVG * V - ITUR_BT_601_CUG * U;
    const int buv = half_unit + ITUR_BT_601_CUB * U;

    nv12_store_pixel(outTop, lumaTL, ruv, guv, buv);
    nv12_store_pixel(outTop + dcn, lumaTR, ruv, guv, buv);
    nv12_store_pixel(outBottom, lumaBL, ruv, guv, buv);
    nv12_store_pixel(outBottom + dcn, lumaBR, ruv, guv, buv);
}
///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////
// Coefficient tables, ordered { R->Y, G->Y, B->Y, (R-Y)->Cr, (B-Y)->Cb }.
// Note the luma weights are stored red-first, unlike the YUV tables.
__constant float c_RGB2YCrCbCoeffs_f[5] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
__constant int c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, 11682, 9241};

// Converts one RGB/BGR pixel per work-item to Y, Cr, Cb (stored in that order).
//
//   srcptr/dstptr       - base pointers of the source / destination buffers
//   srcstep/dststep     - row pitch in bytes
//   srcoffset/dstoffset - byte offset of the first pixel inside the buffer
//   rows, cols          - image size; work-items outside it do nothing
//
// `bidx` is the index of the blue channel, `bidx^2` that of the red channel.
__kernel void RGB2YCrCb(__global const uchar* srcptr, int srcstep, int srcoffset,
                        __global uchar* dstptr, int dststep, int dstoffset,
                        int rows, int cols)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if (y < rows && x < cols)
    {
        // Typed views must remain __global; dropping the address space in the
        // cast is not valid OpenCL 1.x.
        __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
        __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));

        const DATA_TYPE b = src[bidx], g = src[1], r = src[bidx^2];
#if defined (DEPTH_5)
        __constant float * coeffs = c_RGB2YCrCbCoeffs_f;
        // The table is red-first: blue pairs with coeffs[2], red with coeffs[0].
        const DATA_TYPE Y = b * coeffs[2] + g * coeffs[1] + r * coeffs[0];
        const DATA_TYPE Cr = (r - Y) * coeffs[3] + HALF_MAX;
        const DATA_TYPE Cb = (b - Y) * coeffs[4] + HALF_MAX;
#else
        __constant int * coeffs = c_RGB2YCrCbCoeffs_i;
        // HALF_MAX expressed in the same fixed-point scale as the products.
        const int delta = HALF_MAX * (1 << yuv_shift);
        // The table is red-first: blue pairs with coeffs[2], red with coeffs[0].
        const int Y = CV_DESCALE(b * coeffs[2] + g * coeffs[1] + r * coeffs[0], yuv_shift);
        const int Cr = CV_DESCALE((r - Y) * coeffs[3] + delta, yuv_shift);
        const int Cb = CV_DESCALE((b - Y) * coeffs[4] + delta, yuv_shift);
#endif
        dst[0] = SAT_CAST( Y );
        dst[1] = SAT_CAST( Cr );
        dst[2] = SAT_CAST( Cb );
    }
}

@ -0,0 +1,275 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Peng Xiao, pengxiao@outlook.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
// Mask support is off unless the build enables it.
#if !defined(WITH_MASK)
#define WITH_MASK 0
#endif

// Unnormalised coordinates, clamp-to-edge addressing, nearest-neighbour fetch.
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

// Returns the first component of the texel of `_eig` at integer position (_x, _y).
inline float ELEM_INT2(image2d_t _eig, int _x, int _y)
{
    const int2 coord = (int2)(_x, _y);
    const float4 texel = read_imagef(_eig, sampler, coord);
    return texel.x;
}

// Returns the first component of the texel of `_eig` at floating-point position `pt`.
inline float ELEM_FLT2(image2d_t _eig, float2 pt)
{
    const float4 texel = read_imagef(_eig, sampler, pt);
    return texel.x;
}
// Collects the positions of local maxima of `eig` that exceed `threshold`.
//
// One work-item examines one interior pixel (border pixels are skipped). A
// pixel is recorded when its value is greater than `threshold` and not smaller
// than any of its eight neighbours. Every hit increments *g_counter; the
// position is stored only while the obtained index is below `max_count`.
//
//   eig        - image sampled through ELEM_INT2
//   mask       - optional mask, consulted only when WITH_MASK is non-zero
//   corners    - output list of (x, y) positions
//   mask_strip - mask row pitch, in pixels
//   rows, cols - image size
__kernel
void findCorners
(
    image2d_t eig,
    __global const char * mask,
    __global float2 * corners,
    const int mask_strip,// in pixels
    const float threshold,
    const int rows,
    const int cols,
    const int max_count,
    __global int * g_counter
)
{
    const int j = get_global_id(0);
    const int i = get_global_id(1);

    // Only interior pixels have a full 3x3 neighbourhood.
    if (i <= 0 || i >= rows - 1 || j <= 0 || j >= cols - 1)
        return;
#if WITH_MASK
    if (mask[i * mask_strip + j] == 0)
        return;
#endif

    const float centre = ELEM_INT2(eig, j, i);
    if (!(centre > threshold))
        return;

    // Largest value over the centre and its eight neighbours.
    float best = centre;
    for (int dy = -1; dy <= 1; ++dy)
    {
        for (int dx = -1; dx <= 1; ++dx)
        {
            if (dx == 0 && dy == 0)
                continue;
            best = fmax(ELEM_INT2(eig, j + dx, i + dy), best);
        }
    }

    if (centre == best)
    {
        const int slot = atomic_inc(g_counter);
        if (slot < max_count)
            corners[slot] = (float2)(j, i);
    }
}
// One compare-and-swap pass of a bitonic sort over `corners`, keyed by the
// value of `eig` sampled at each corner position.
//
// Each work-item below count / 2 handles one pair of elements that are
// 1 << (stage - passOfStage) apart and stores the pair back in descending or
// ascending order depending on the work-item's block within the stage.
//
// NOTE(review): both indices are clamped to `count`, not `count - 1`; this looks
// like it relies on the clamp never triggering (e.g. `count` being a power of
// two) -- confirm against the host code.
__kernel
void sortCorners_bitonicSort
(
    image2d_t eig,
    __global float2 * corners,
    const int count,
    const int stage,
    const int passOfStage
)
{
    const int tid = get_global_id(0);
    if (tid >= count / 2)
        return;

    // false -> larger value goes to the left slot (descending)
    const bool ascending = ((tid / (1 << stage)) % 2) == 1;

    const int span = 1 << (stage - passOfStage);
    const int blockWidth = 2 * span;
    const int leftId = min((tid % span) + (tid / span) * blockWidth, count);
    const int rightId = min(leftId + span, count);

    const float2 leftPt = corners[leftId];
    const float2 rightPt = corners[rightId];
    const float leftVal = ELEM_FLT2(eig, leftPt);
    const float rightVal = ELEM_FLT2(eig, rightPt);

    const bool leftWins = leftVal > rightVal;
    const float2 larger = leftWins ? leftPt : rightPt;
    const float2 smaller = leftWins ? rightPt : leftPt;

    if (ascending)
    {
        corners[leftId] = smaller;
        corners[rightId] = larger;
    }
    else
    {
        corners[leftId] = larger;
        corners[rightId] = smaller;
    }
}
//selection sort for gftt
//kernel is ported from Bolt library:
//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl
// The local pass sorts the elements owned by each work-group: every work-item
// ranks its own element against all elements of the group (n comparisons per
// work-item) and writes it to the slot given by that rank.
//
//   eig     - image sampled at each corner position to obtain the sort key
//   corners - list to sort in place; each work-group handles one slice of
//             get_local_size(0) elements
//   count   - total number of elements in `corners`
//   scratch - local buffer with at least get_local_size(0) elements
__kernel
void sortCorners_selectionSortLocal
(
    image2d_t eig,
    __global float2 * corners,
    const int count,
    __local float2 * scratch
)
{
    const int i = get_local_id(0);               // index in workgroup
    const int numOfGroups = get_num_groups(0);   // number of workgroups
    const int groupID = get_group_id(0);
    const int wg = get_local_size(0);            // workgroup size = block size

    corners += groupID * wg;

    // number of elements to be processed by this work-group; only the last
    // group can be partially filled
    const int n = (groupID == (numOfGroups-1)) ? (count - wg*(numOfGroups-1)) : wg;

    // Work-items beyond the valid range must not touch `corners`: index n would
    // be one past the last element of the list. Their scratch slot is never read.
    const bool active = i < n;
    float2 pt1 = (float2)(0.0f, 0.0f);
    if (active)
    {
        pt1 = corners[i];
        scratch[i] = pt1;
    }
    barrier(CLK_LOCAL_MEM_FENCE);   // reached by every work-item of the group

    if (!active)
        return;

    const float val1 = ELEM_FLT2(eig, pt1);
    int pos = 0;    // elements strictly greater than ours -> our rank
    int same = 0;   // elements comparing equal to ours (includes ourselves)
    for (int j = 0; j < n; ++j)
    {
        const float val2 = ELEM_FLT2(eig, scratch[j]);
        if (val2 > val1)
            pos++;
        else if (!(val1 > val2))
            same++;
    }

    // Elements with equal keys all fill the same run of slots.
    for (int j = 0; j < same; j++)
        corners[pos + j] = pt1;
}
// Final pass of the selection sort: every work-item ranks its element against
// the elements of all work-group-sized chunks of `corners` and writes it to
// the slot given by that rank.
//
// Scanning of a chunk stops at the first element that is smaller than ours.
// NOTE(review): this presumably relies on every chunk already being in
// descending order from the local pass -- confirm. The list is also read and
// written in place by different work-items; verify the host accounts for that.
//
//   eig     - image sampled at each corner position to obtain the sort key
//   corners - list being sorted
//   count   - total number of elements in `corners`
__kernel
void sortCorners_selectionSortFinal
(
    image2d_t eig,
    __global float2 * corners,
    const int count
)
{
    const int lid = get_local_id(0);        // index in workgroup
    const int groups = get_num_groups(0);   // number of workgroups
    const int wg = get_local_size(0);       // workgroup size = block size
    const int base = get_group_id(0) * wg;

    if (base + lid >= count)
        return;

    // Length of the last chunk, which may be only partially filled.
    const int lastLen = count - wg * (groups - 1);

    const float2 pt1 = corners[base + lid];
    const float val1 = ELEM_FLT2(eig, pt1);

    int pos = 0;    // elements strictly greater than ours
    int same = 0;   // elements comparing equal to ours
    for (int g = 0; g < groups; ++g)
    {
        const int len = (g == groups - 1) ? lastLen : wg;
        __global const float2 * chunk = corners + g * wg;
        for (int k = 0; k < len; ++k)
        {
            const float val2 = ELEM_FLT2(eig, chunk[k]);
            if (val1 > val2)
                break;      // leave this chunk, continue with the next one
            if (val2 > val1)
                ++pos;
            else
                ++same;
        }
    }

    for (int j = 0; j < same; ++j)
        corners[pos + j] = pt1;
}

@ -0,0 +1,202 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Shengen Yan,yanshengen@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
// Double precision is enabled only when the build defines DOUBLE_SUPPORT.
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////Macro for border type////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////
// Exactly one BORDER_* macro is expected to be defined by the build (they are
// not defined in this file). For every mode except BORDER_CONSTANT four
// address-mapping macros are provided:
//   ADDR_L(i, l_edge, r_edge) - remaps a column index that lies left of l_edge
//   ADDR_R(i, r_edge, addr)   - remaps a column index at or beyond r_edge,
//                               otherwise yields `addr` (the ADDR_L result)
//   ADDR_H(i, t_edge, b_edge) - same as ADDR_L, for rows above t_edge
//   ADDR_B(i, b_edge, addr)   - same as ADDR_R, for rows at or beyond b_edge
// The REFLECT and REFLECT101 left/top mappings do not use the edge argument in
// their result, so they are only meaningful for an edge of 0.
#ifdef BORDER_REPLICATE
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#endif
#ifdef BORDER_REFLECT
//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
#endif
#ifdef BORDER_REFLECT101
//BORDER_REFLECT101: gfedcb|abcdefgh|gfedcba
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
#endif
#ifdef BORDER_WRAP
//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
#endif
// Width of the local-memory row buffers used by calcHarris.
#define THREADS 256
// Selects elem1 when i lies in [l_edge, r_edge), elem2 otherwise. The expansion
// is not wrapped in parentheses, so callers must parenthesise the whole use.
#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2)
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////calcHarris////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////
// Computes a corner response from two input images Dx and Dy.
//
// For every output pixel the products Dx*Dx, Dx*Dy and Dy*Dy are summed over a
// ksX x ksY window (anchor anX, anY; all four are build-time macros) and
// combined as  a*c - b*b - k*(a + c)^2.
//
// Work decomposition, as implemented below:
//   * each work-item owns one column of a work-group tile and sums ksY + 1
//     consecutive rows vertically, producing column sums for two adjacent
//     output rows;
//   * the column sums go through local memory, then the first
//     THREADS - (ksX - 1) work-items add ksX neighbouring columns and write up
//     to two output rows.
//
//   Dx, Dy, dst      - buffers of float elements
//   *_offset         - byte offset of the region of interest inside the buffer
//   *_step           - row pitch in bytes (converted to elements with >> 2)
//   *_whole_rows/cols- size of the whole input buffer, used for border handling
//   dst_rows/cols    - size of the output region
//   k                - weight of the squared-trace term
//
// NOTE(review): the output row is derived from get_group_id(1) for addressing
// and from get_global_id(1) for the bounds test; these agree only when the
// work-group height is 1 -- confirm against the host launch parameters.
__kernel void calcHarris(__global const float *Dx,__global const float *Dy, __global float *dst,
                         int dx_offset, int dx_whole_rows, int dx_whole_cols, int dx_step,
                         int dy_offset, int dy_whole_rows, int dy_whole_cols, int dy_step,
                         int dst_offset, int dst_rows, int dst_cols, int dst_step,
                         float k)
{
    int col = get_local_id(0);
    const int gX = get_group_id(0);
    const int gY = get_group_id(1);
    const int gly = get_global_id(1);

    // Split each byte offset into a column (in elements) and a row.
    int dx_x_off = (dx_offset % dx_step) >> 2;
    int dx_y_off = dx_offset / dx_step;
    int dy_x_off = (dy_offset % dy_step) >> 2;
    int dy_y_off = dy_offset / dy_step;
    int dst_x_off = (dst_offset % dst_step) >> 2;
    int dst_y_off = dst_offset / dst_step;

    // Top-left corner of the tile handled by this work-group.
    int dx_startX = gX * (THREADS-ksX+1) - anX + dx_x_off;
    int dx_startY = (gY << 1) - anY + dx_y_off;
    int dy_startX = gX * (THREADS-ksX+1) - anX + dy_x_off;
    int dy_startY = (gY << 1) - anY + dy_y_off;
    int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
    int dst_startY = (gY << 1) + dst_y_off;

    // data[0] = Dx*Dx, data[1] = Dx*Dy, data[2] = Dy*Dy for each of the rows.
    float data[3][ksY+1];
    __local float temp[6][THREADS];

#ifdef BORDER_CONSTANT
    for(int i=0; i < ksY+1; i++)
    {
        // Load only when the position is inside the buffer; positions outside
        // contribute zero and must not be dereferenced.
        const bool dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows;
        const float dx_val = dx_con ? Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)] : 0.0f;

        const bool dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows;
        const float dy_val = dy_con ? Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)] : 0.0f;

        data[0][i] = dx_val * dx_val;
        data[1][i] = dx_val * dy_val;
        data[2][i] = dy_val * dy_val;
    }
#else
    int clamped_col = min(dst_cols, col);
    for(int i=0; i < ksY+1; i++)
    {
        // Remap out-of-range coordinates according to the selected border mode.
        int dx_selected_row;
        int dx_selected_col;
        dx_selected_row = ADDR_H(dx_startY+i, 0, dx_whole_rows);
        dx_selected_row = ADDR_B(dx_startY+i, dx_whole_rows, dx_selected_row);
        dx_selected_col = ADDR_L(dx_startX+clamped_col, 0, dx_whole_cols);
        dx_selected_col = ADDR_R(dx_startX+clamped_col, dx_whole_cols, dx_selected_col);
        const float dx_val = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col];

        int dy_selected_row;
        int dy_selected_col;
        dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows);
        dy_selected_row = ADDR_B(dy_startY+i, dy_whole_rows, dy_selected_row);
        dy_selected_col = ADDR_L(dy_startX+clamped_col, 0, dy_whole_cols);
        dy_selected_col = ADDR_R(dy_startX+clamped_col, dy_whole_cols, dy_selected_col);
        const float dy_val = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col];

        data[0][i] = dx_val * dx_val;
        data[1][i] = dx_val * dy_val;
        data[2][i] = dy_val * dy_val;
    }
#endif

    // Rows 1 .. ksY-1 are shared by both output rows; row 0 belongs only to
    // the upper one and row ksY only to the lower one.
    float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f;
    for(int i=1; i < ksY; i++)
    {
        sum0 += (data[0][i]);
        sum1 += (data[1][i]);
        sum2 += (data[2][i]);
    }

    // temp[2*c] holds the column sum for the upper row, temp[2*c+1] for the lower.
    temp[0][col] = sum0 + (data[0][0]);
    temp[1][col] = sum0 + (data[0][ksY]);
    temp[2][col] = sum1 + (data[1][0]);
    temp[3][col] = sum1 + (data[1][ksY]);
    temp[4][col] = sum2 + (data[2][0]);
    temp[5][col] = sum2 + (data[2][ksY]);
    barrier(CLK_LOCAL_MEM_FENCE);

    if(col < (THREADS-(ksX-1)))
    {
        col += anX;
        int posX = dst_startX - dst_x_off + col - anX;
        int posY = (gly << 1);
        // For an even ksX the window is one column shorter on the right side.
        int till = (ksX + 1)%2;

        float tmp_sum[6] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        for(int s=0; s<6; s++)
            for(int i=-anX; i<=anX - till; i++)
            {
                tmp_sum[s] += temp[s][col+i];
            }

        if(posX < dst_cols && (posY) < dst_rows)
        {
            dst[(dst_startY+0) * (dst_step>>2)+ dst_startX + col - anX] =
                tmp_sum[0] * tmp_sum[4] - tmp_sum[2] * tmp_sum[2] - k * (tmp_sum[0] + tmp_sum[4]) * (tmp_sum[0] + tmp_sum[4]);
        }
        if(posX < dst_cols && (posY + 1) < dst_rows)
        {
            dst[(dst_startY+1) * (dst_step>>2)+ dst_startX + col - anX] =
                tmp_sum[1] * tmp_sum[5] - tmp_sum[3] * tmp_sum[3] - k * (tmp_sum[1] + tmp_sum[5]) * (tmp_sum[1] + tmp_sum[5]);
        }
    }
}

@ -0,0 +1,279 @@
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Niko Li, newlife20080214@gmail.com
// Jia Haipeng, jiahaipeng95@gmail.com
// Xu Pang, pangxu010@163.com
// Wenju He, wenju@multicorewareinc.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//
// Number of partial histograms summed per bin by merge_hist.
#define PARTIAL_HISTOGRAM256_COUNT (256)
// Number of histogram bins (one per 8-bit value).
#define HISTOGRAM256_BIN_COUNT (256)
// Work-group size assumed by merge_hist's local reduction.
#define HISTOGRAM256_WORK_GROUP_SIZE (256)
// Size of the local histogram used by calc_sub_hist_border_D0.
#define HISTOGRAM256_LOCAL_MEM_SIZE (HISTOGRAM256_BIN_COUNT)
// calc_sub_hist_D0 keeps NBANKS copies of every bin; NBANKS == 1 << NBANKS_BIT.
#define NBANKS (16)
#define NBANKS_BIT (4)
// Builds one partial 256-bin histogram per work-group.
//
// Every element read from `src` is a uint4, i.e. sixteen 8-bit samples. Each
// work-item scatters its samples into a local table that keeps NBANKS copies
// of every bin (the copy is chosen by the low bits of the local id); the
// copies are then summed and the work-group's histogram is written to row
// get_group_id(0) of `globalHist`.
//
//   src, src_step, src_offset - input; step and offset are applied in uint4 units
//   globalHist, hist_step     - output partial histograms and their row pitch
//   dataCount                 - number of uint4 elements to process
//   cols                      - number of uint4 elements per row
//   inc_x, inc_y              - column / row advance applied between two
//                               elements handled by the same work-item
__kernel __attribute__((reqd_work_group_size(HISTOGRAM256_BIN_COUNT,1,1)))void calc_sub_hist_D0(
    __global const uint4* src,
    int src_step, int src_offset,
    __global int* globalHist,
    int dataCount, int cols,
    int inc_x, int inc_y,
    int hist_step)
{
    __local int subhist[(HISTOGRAM256_BIN_COUNT << NBANKS_BIT)]; // NBINS*NBANKS

    const int gid = get_global_id(0);
    const int lid = get_local_id(0);
    const int group = get_group_id(0);
    const int gsize = get_global_size(0);
    const int lsize = get_local_size(0);

    const int shift = 8;
    const int mask = HISTOGRAM256_BIN_COUNT-1;
    const int bank = (lid & (NBANKS-1)); // lid % NBANKS

    src += src_offset;

    // Clear the local table: every work-item zeroes NBANKS entries, lsize apart.
    for (int k = 0; k < NBANKS; ++k)
        subhist[lid + k * lsize] = 0;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Read and scatter.
    int y = gid/cols;
    int x = gid - mul24(y, cols);
    for (int idx = gid; idx < dataCount; idx += gsize)
    {
        uint4 data = src[mad24(y, src_step, x)];

        // Peel the four bytes of every component, lowest byte first.
        for (int b = 0; b < 4; ++b)
        {
            const uint4 slot = ((data & mask) << NBANKS_BIT) + bank;
            atomic_inc(subhist + slot.x);
            atomic_inc(subhist + slot.y);
            atomic_inc(subhist + slot.z);
            atomic_inc(subhist + slot.w);
            data >>= shift;
        }

        // Advance to this work-item's next element, wrapping to a later row
        // when the column runs past the end.
        x += inc_x;
        if (x >= cols)
        {
            x -= cols;
            y += inc_y + 1;
        }
        else
        {
            y += inc_y;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Reduce the banks of bin `lid` to a single count.
    int total = 0;
    for (int i = 0; i < NBANKS; ++i)
        total += subhist[(lid << NBANKS_BIT) + i];

    globalHist[mad24(group, hist_step, lid)] = total;
}
// Adds the 8-bit samples of selected columns to the partial histograms.
//
// Each work-group is one column wide and HISTOGRAM256_BIN_COUNT rows tall. It
// builds a local histogram of the samples it covers and adds it to row
// (group_y * num_groups_x + group_x) of `globalHist`.
//
//   src, src_step, src_offset - input bytes, row pitch and start offset
//   globalHist, hist_step     - partial histograms and their row pitch
//   left_col, cols            - column indices at or beyond left_col are
//                               shifted right by cols before addressing
//   rows                      - number of valid rows
__kernel void __attribute__((reqd_work_group_size(1,HISTOGRAM256_BIN_COUNT,1)))
calc_sub_hist_border_D0(__global const uchar* src, int src_step, int src_offset,
                        __global int* globalHist, int left_col, int cols,
                        int rows, int hist_step)
{
    __local int subhist[HISTOGRAM256_LOCAL_MEM_SIZE];

    const int lidy = get_local_id(1);
    subhist[lidy] = 0;
    barrier(CLK_LOCAL_MEM_FENCE);

    const int gidy = get_global_id(1);
    if (gidy < rows)
    {
        int gidx = get_global_id(0);
        if (gidx >= left_col)
            gidx += cols;

        const int src_index = src_offset + mad24(gidy, src_step, gidx);
        const int p = (int)src[src_index];
        atomic_inc(subhist + p);
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    const int gx = get_group_id(0);
    const int gy = get_group_id(1);
    const int gn = get_num_groups(0);
    const int rowIndex = mad24(gy, gn, gx);

    globalHist[mad24(rowIndex, hist_step, lidy)] += subhist[lidy];
}
// Sums the partial histograms into the final one.
//
// Work-group `g` produces bin `g`: its work-items read entry `g` of the
// partial histograms (rows of `buf`, `src_step` elements apart), the values
// are added up with a tree reduction in local memory, and work-item 0 stores
// the result to hist[g].
__kernel __attribute__((reqd_work_group_size(256,1,1)))void merge_hist(__global int* buf,
                                                                        __global int* hist,
                                                                        int src_step)
{
    __local int data[HISTOGRAM256_WORK_GROUP_SIZE];

    const int lx = get_local_id(0);
    const int bin = get_group_id(0);

    // Per-work-item partial sum over the rows assigned to it.
    int partial = 0;
    for (int row = lx; row < PARTIAL_HISTOGRAM256_COUNT; row += HISTOGRAM256_WORK_GROUP_SIZE)
        partial += buf[mad24(row, src_step, bin)];
    data[lx] = partial;

    // Tree reduction; the barrier precedes each step so the previous step's
    // writes (and the initial store above) are visible.
    int stride = HISTOGRAM256_WORK_GROUP_SIZE / 2;
    while (stride > 0)
    {
        barrier(CLK_LOCAL_MEM_FENCE);
        if (lx < stride)
            data[lx] += data[lx + stride];
        stride >>= 1;
    }

    if (lx == 0)
        hist[bin] = data[0];
}
// Builds a 256-entry lookup table from a histogram.
//
// Work-item 0 finds the first non-empty bin, turns the histogram into a
// cumulative sum that excludes that bin, and computes the scale that maps the
// largest cumulative value to 255. Every work-item then writes one table entry.
// When the first non-empty bin already holds all `total` samples, the table is
// the identity mapping.
//
//   dst   - output table, HISTOGRAM256_BIN_COUNT entries
//   hist  - input histogram, HISTOGRAM256_BIN_COUNT entries
//   total - number of samples counted in `hist`
__kernel __attribute__((reqd_work_group_size(256,1,1)))
void calLUT(__global uchar * dst, __constant int * hist, int total)
{
    const int lid = get_local_id(0);
    __local int sumhist[HISTOGRAM256_BIN_COUNT];
    __local float scale;

    sumhist[lid] = hist[lid];
    barrier(CLK_LOCAL_MEM_FENCE);

    if (lid == 0)
    {
        int sum = 0, i = 0;

        // Stop at the last bin so an all-zero histogram cannot run the scan
        // past the end of the array.
        while (i < HISTOGRAM256_BIN_COUNT - 1 && !sumhist[i])
            ++i;

        if (total == sumhist[i])
        {
            // Single populated bin: nothing to equalize, use the identity table.
            scale = 1;
            for (int j = 0; j < HISTOGRAM256_BIN_COUNT; ++j)
                sumhist[j] = j;
        }
        else
        {
            scale = 255.f/(total - sumhist[i]);
            // The first non-empty bin maps to 0; later bins get the running sum.
            for (sumhist[i++] = 0; i < HISTOGRAM256_BIN_COUNT; i++)
            {
                sum += sumhist[i];
                sumhist[i] = sum;
            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    dst[lid]= convert_uchar_sat_rte(convert_float(sumhist[lid])*scale);
}
/*
///////////////////////////////equalizeHist//////////////////////////////////////////////////
__kernel __attribute__((reqd_work_group_size(256,1,1)))void equalizeHist(
__global uchar * src,
__global uchar * dst,
__constant int * hist,
int srcstep,
int srcoffset,
int dststep,
int dstoffset,
int width,
int height,
float scale,
int inc_x,
int inc_y)
{
int gidx = get_global_id(0);
int lid = get_local_id(0);
int glb_size = get_global_size(0);
src+=srcoffset;
dst+=dstoffset;
__local int sumhist[HISTOGRAM256_BIN_COUNT];
__local uchar lut[HISTOGRAM256_BIN_COUNT+1];
sumhist[lid]=hist[lid];
barrier(CLK_LOCAL_MEM_FENCE);
if(lid==0)
{
int sum = 0;
for(int i=0;i<HISTOGRAM256_BIN_COUNT;i++)
{
sum+=sumhist[i];
sumhist[i]=sum;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
lut[lid]= convert_uchar_sat(convert_float(sumhist[lid])*scale);
lut[0]=0;
int pos_y = gidx / width;
int pos_x = gidx - mul24(pos_y, width);
for(int pos = gidx; pos < mul24(width,height); pos += glb_size)
{
int inaddr = mad24(pos_y,srcstep,pos_x);
int outaddr = mad24(pos_y,dststep,pos_x);
dst[outaddr] = lut[src[inaddr]];
pos_x +=inc_x;
int off = (pos_x >= width ? -1 : 0);
pos_x = mad24(off,width,pos_x);
pos_y += inc_y - off;
}
}
*/

@ -0,0 +1,280 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
// Enable 32-bit atomics on global and local memory: the kernels in this file call
// atomic_add on both __global and __local integers.
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
////////////////////////////////////////////////////////////////////////
// buildPointList
// Upper bound on the number of pixels a single buildPointList work-item examines;
// it also sizes the per-row local queues (32 * PIXELS_PER_THREAD entries each).
#define PIXELS_PER_THREAD 16
// TODO: add offset to support ROI
// Gathers the coordinates of all non-zero pixels of `src` into `list`.
//
// Every work-item scans up to PIXELS_PER_THREAD pixels of row y, spaced
// get_local_size(0) apart, and pushes each non-zero pixel, packed as
// (y << 16) | x, onto a queue in local memory (one queue per local row).
// A single work-item then reserves a contiguous range of `list` with one
// atomic_add on `counter`, and the whole group copies its queues into it.
//
//   src     - 8-bit input, addressed as src[y * step + x]
//   cols    - number of pixels to scan in each row
//   rows    - number of rows to scan
//   step    - distance between consecutive rows of src, in elements of src
//   list    - output array receiving the packed coordinates
//   counter - number of entries already stored in list; advanced atomically
//
// NOTE(review): the local arrays are sized for a work-group of at most 32 x 4
// work-items -- confirm against the host launch configuration.
__kernel void buildPointList(__global const uchar* src,
                             int cols,
                             int rows,
                             int step,
                             __global unsigned int* list,
                             __global int* counter)
{
    __local unsigned int s_queues[4][32 * PIXELS_PER_THREAD];
    __local int s_qsize[4];
    __local int s_globStart[4];

    const int x = get_group_id(0) * get_local_size(0) * PIXELS_PER_THREAD + get_local_id(0);
    const int y = get_global_id(1);

    // The first work-item of each local row empties that row's queue.
    if (get_local_id(0) == 0)
        s_qsize[get_local_id(1)] = 0;
    barrier(CLK_LOCAL_MEM_FENCE);

    if (y < rows)
    {
        // fill the queue
        __global const uchar* srcRow = &src[y * step];
        for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < cols; ++i, xx += get_local_size(0))
        {
            if (srcRow[xx])
            {
                const unsigned int val = (y << 16) | xx;
                const int qidx = atomic_add(&s_qsize[get_local_id(1)], 1);
                s_queues[get_local_id(1)][qidx] = val;
            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // let one work-item reserve the space required in the global list
    if (get_local_id(0) == 0 && get_local_id(1) == 0)
    {
        // find how many items are stored in each list
        int totalSize = 0;
        for (int i = 0; i < get_local_size(1); ++i)
        {
            s_globStart[i] = totalSize;
            totalSize += s_qsize[i];
        }

        // calculate the offset in the global list
        const int globalOffset = atomic_add(counter, totalSize);
        for (int i = 0; i < get_local_size(1); ++i)
            s_globStart[i] += globalOffset;
    }
    // s_globStart lives in __local memory and is written by a single work-item,
    // so the fence must cover local memory for the other work-items to see it.
    // (The original requested only CLK_GLOBAL_MEM_FENCE here.)
    barrier(CLK_LOCAL_MEM_FENCE);

    // copy local queues to global queue
    const int qsize = s_qsize[get_local_id(1)];
    int gidx = s_globStart[get_local_id(1)] + get_local_id(0);
    for (int i = get_local_id(0); i < qsize; i += get_local_size(0), gidx += get_local_size(0))
        list[gidx] = s_queues[get_local_id(1)][i];
}
////////////////////////////////////////////////////////////////////////
// circlesAccumCenters
// TODO: add offset to support ROI
// Votes for circle centres in `accum`.
//
// One work-item handles one entry of `list` (a point packed as (y << 16) | x).
// It reads the gradient (dx, dy) at that point and walks along the gradient
// direction, then along the opposite direction, for radii minRadius..maxRadius,
// atomically incrementing the accumulator cell under every step. The walk in a
// given direction stops as soon as it leaves [0, width) x [0, height).
// Accumulator cells are addressed with a +1 offset in both coordinates.
//
//   list, count        - packed points and their number
//   dx, dy             - int gradient images; dxStep / dyStep are divided by sizeof(int)
//   accum, accumStep   - int accumulator; accumStep is divided by sizeof(int)
//   width, height      - bounds tested against the accumulator-space coordinates
//   minRadius,maxRadius- inclusive radius range
//   idp                - factor applied to image coordinates and gradient steps
__kernel void circlesAccumCenters(__global const unsigned int* list,
                                  const int count,
                                  __global const int* dx,
                                  const int dxStep,
                                  __global const int* dy,
                                  const int dyStep,
                                  __global int* accum,
                                  const int accumStep,
                                  const int width,
                                  const int height,
                                  const int minRadius,
                                  const int maxRadius,
                                  const float idp)
{
    // Positions are tracked in fixed point with FRAC_BITS fractional bits.
    const int FRAC_BITS = 10;
    const int FIXED_ONE = 1 << FRAC_BITS;

    const int dxPitch = dxStep / sizeof(int);
    const int dyPitch = dyStep / sizeof(int);
    const int accumPitch = accumStep / sizeof(int);

    const int pointIdx = get_global_id(0);
    if (pointIdx < count)
    {
        const unsigned int packed = list[pointIdx];
        const int px = (packed & 0xFFFF);
        const int py = (packed >> 16) & 0xFFFF;

        const int gradX = dx[mad24(py, dxPitch, px)];
        const int gradY = dy[mad24(py, dyPitch, px)];

        // A zero gradient gives no direction to vote along.
        if (gradX != 0 || gradY != 0)
        {
            const float mag = sqrt(convert_float(gradX * gradX + gradY * gradY));

            const int startX = convert_int_rte((px * idp) * FIXED_ONE);
            const int startY = convert_int_rte((py * idp) * FIXED_ONE);

            // Unit gradient step in fixed point.
            int stepX = convert_int_rte((gradX * idp) * FIXED_ONE / mag);
            int stepY = convert_int_rte((gradY * idp) * FIXED_ONE / mag);

            // First pass follows the gradient, second pass the opposite direction.
            for (int pass = 0; pass < 2; ++pass, stepX = -stepX, stepY = -stepY)
            {
                int fx = startX + minRadius * stepX;
                int fy = startY + minRadius * stepY;

                int r = minRadius;
                while (r <= maxRadius)
                {
                    const int cellX = fx >> FRAC_BITS;
                    const int cellY = fy >> FRAC_BITS;

                    const bool inside = cellX >= 0 && cellX < width &&
                                        cellY >= 0 && cellY < height;
                    if (!inside)
                        break;

                    atomic_add(&accum[mad24(cellY + 1, accumPitch, cellX + 1)], 1);

                    fx += stepX;
                    fy += stepY;
                    ++r;
                }
            }
        }
    }
}
////////////////////////////////////////////////////////////////////////
// buildCentersList
// TODO: add offset to support ROI
// Extracts local maxima of the accumulator as candidate circle centres.
//
// Work-item (x, y) inspects accumulator cell (x + 1, y + 1). The cell is
// appended to `centers`, packed as (y << 16) | x, when its value exceeds
// `threshold`, is strictly greater than its top and left neighbours, and is
// greater than or equal to its bottom and right neighbours. The slot in
// `centers` is obtained with an atomic increment of `counter`.
//
//   accum, accumCols, accumRows - int accumulator and its extent
//   accumStep                   - accumulator pitch; divided by sizeof(int)
//   centers                     - output array of packed centres
//   threshold                   - minimum vote count (exclusive)
//   counter                     - number of centres stored so far
__kernel void buildCentersList(__global const int* accum,
                               const int accumCols,
                               const int accumRows,
                               const int accumStep,
                               __global unsigned int* centers,
                               const int threshold,
                               __global int* counter)
{
    const int pitch = accumStep/sizeof(int);

    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Only cells with a full 4-neighbourhood are examined.
    if (col >= accumCols - 2 || row >= accumRows - 2)
        return;

    const int above  = accum[mad24(row,     pitch, col + 1)];
    const int before = accum[mad24(row + 1, pitch, col)];
    const int centre = accum[mad24(row + 1, pitch, col + 1)];
    const int after  = accum[mad24(row + 1, pitch, col + 2)];
    const int below  = accum[mad24(row + 2, pitch, col + 1)];

    // Strict comparison on top/left and non-strict on bottom/right, so that a
    // plateau of equal values yields at most one of two adjacent cells.
    const bool isPeak = centre > threshold &&
                        centre > above && centre >= below &&
                        centre > before && centre >= after;
    if (!isPeak)
        return;

    const unsigned int packed = (row << 16) | col;
    const int slot = atomic_add(counter, 1);
    centers[slot] = packed;
}
////////////////////////////////////////////////////////////////////////
// circlesAccumRadius
// TODO: add offset to support ROI
// Estimates circle radii for the candidate centres.
//
// One work-group handles one entry of `centers`. The group builds, in local
// memory, a histogram of the distances from that centre to every point in
// `list` that fall inside [minRadius, maxRadius]; bin r + 1 counts distance
// minRadius + r, leaving one guard bin at each end. Every bin that reaches
// `threshold`, is strictly above its lower neighbour and not below its upper
// neighbour is emitted as a circle (cx, cy, radius, 0) while fewer than
// maxCircles slots have been claimed through `counter`.
//
//   centers            - packed centres, one per work-group
//   list, count        - packed edge points and their number
//   circles,maxCircles - output array and its capacity
//   dp                 - factor applied to the unpacked centre coordinates
//   histSize           - number of radius bins; smem must hold histSize + 2 ints
//   threshold          - minimum number of votes for a radius
//   counter            - number of circles claimed so far (may exceed maxCircles)
__kernel void circlesAccumRadius(__global const unsigned int* centers,
                                 __global const unsigned int* list, const int count,
                                 __global float4* circles, const int maxCircles,
                                 const float dp,
                                 const int minRadius, const int maxRadius,
                                 const int histSize,
                                 const int threshold,
                                 __local int* smem,
                                 __global int* counter)
{
    const int lid = get_local_id(0);
    const int lsz = get_local_size(0);

    // Clear the histogram together with its two guard bins.
    const int binCount = histSize + 2;
    for (int bin = lid; bin < binCount; bin += lsz)
        smem[bin] = 0;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Unpack this group's centre and move it to the middle of its cell.
    const unsigned int packedCentre = centers[get_group_id(0)];
    const float cx = (convert_float(packedCentre & 0xFFFF) + 0.5f) * dp;
    const float cy = (convert_float((packedCentre >> 16) & 0xFFFF) + 0.5f) * dp;

    // Vote: every point adds one to the bin of its distance from the centre.
    for (int p = lid; p < count; p += lsz)
    {
        const unsigned int packedPoint = list[p];
        const int x = (packedPoint & 0xFFFF);
        const int y = (packedPoint >> 16) & 0xFFFF;

        const float rad = sqrt((cx - x) * (cx - x) + (cy - y) * (cy - y));
        if (!(rad >= minRadius && rad <= maxRadius))
            continue;

        const int r = convert_int_rte(rad - minRadius);
        atomic_add(&smem[r + 1], 1);
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Emit every sufficiently supported local maximum of the histogram.
    for (int bin = lid; bin < histSize; bin += lsz)
    {
        const int votes = smem[bin + 1];
        const bool isPeak = votes >= threshold &&
                            votes > smem[bin] &&
                            votes >= smem[bin + 2];
        if (!isPeak)
            continue;

        // The counter is advanced even when the output array is already full.
        const int slot = atomic_add(counter, 1);
        if (slot < maxCircles)
            circles[slot] = (float4)(cx, cy, convert_float(bin + minRadius), 0.0f);
    }
}

@ -0,0 +1,493 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Shengen Yan,yanshengen@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
// Number of slots scanned per local array. The kernels below split the work-items
// with lid >> 7 / lid & 127, i.e. 128 work-items per array.
#define LSIZE 256
// Number of source rows consumed per iteration of the kernels' outer loop.
#define LSIZE_1 255
// LSIZE_2 + LOG_LSIZE is the local-array slot from which the kernels read the running
// total carried into the next iteration.
#define LSIZE_2 254
// NOTE(review): not referenced by the kernels in this section.
#define HF_LSIZE 128
// Extra slots appended to each local array (LSIZE + LOG_LSIZE) so that indices padded
// by GET_CONFLICT_OFFSET stay in range.
#define LOG_LSIZE 8
// log2(NUM_BANKS); used as the shift in GET_CONFLICT_OFFSET.
#define LOG_NUM_BANKS 5
#define NUM_BANKS 32
// Padding added to a local-array index: one extra slot for every NUM_BANKS elements
// (presumably to spread accesses across local-memory banks).
#define GET_CONFLICT_OFFSET(lid) ((lid) >> LOG_NUM_BANKS)
// integral_cols_D4: running sums down the rows of an 8-bit image.
//
// Each work-group handles two adjacent uchar4 elements of every source row (eight
// byte-wide columns) and accumulates, row after row, the int sum and the float sum of
// squares of each column. The running sum that includes source row r is written for
// byte column c at index (c - pre_invalid) * dst_step / 4 + r, so one source column
// becomes one contiguous run of the output.
//
//   src         - source viewed as uchar4; src_offset is an index in uchar4 units
//   sum, sqsum  - outputs (int sums / float sums of squares), indexed by element
//   pre_invalid - number of leading byte columns that are skipped
//   rows        - number of source rows
//   cols        - column count used in the bounds tests (cols + pre_invalid)
//   src_step    - source pitch, shifted right by 2 before indexing uchar4 elements
//                 (presumably given in bytes)
//   dst_step    - output pitch; dst_step / 4 elements separate consecutive output runs
//
// NOTE(review): the lid >> 7 / lid & 127 indexing presumes LSIZE (256) work-items per
// group -- confirm against the host launch configuration.
kernel void integral_cols_D4(__global uchar4 *src,__global int *sum ,__global float *sqsum,
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
int4 src_t[2], sum_t[2];
float4 sqsum_t[2];
__local int4 lm_sum[2][LSIZE + LOG_LSIZE];
__local float4 lm_sqsum[2][LSIZE + LOG_LSIZE];
__local int* sum_p;
__local float* sqsum_p;
// Pitch in uchar4 units.
src_step = src_step >> 2;
// This group owns the uchar4 columns gid and gid + 1.
gid = gid << 1;
// Walk the rows in chunks of LSIZE_1; work-item lid loads row i + lid.
for(int i = 0; i < rows; i =i + LSIZE_1)
{
// Rows past the end contribute zero; the column index is clamped to cols - 1.
src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : 0);
src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : 0);
// Carry from the previous chunk: the running totals left in the last scan slot.
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
// Everybody must have read the carry before the local arrays are overwritten.
barrier(CLK_LOCAL_MEM_FENCE);
// Padded slot of this work-item in the local arrays.
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
lm_sum[0][bf_loc] = src_t[0];
lm_sqsum[0][bf_loc] = convert_float4(src_t[0] * src_t[0]);
lm_sum[1][bf_loc] = src_t[1];
lm_sqsum[1][bf_loc] = convert_float4(src_t[1] * src_t[1]);
int offset = 1;
// Up-sweep: build partial sums in place. lid >> 7 selects the array (column),
// lid & 127 the pair of slots to combine at this level.
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
{
barrier(CLK_LOCAL_MEM_FENCE);
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
}
offset <<= 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
// Clear the last slot of both arrays so the down-sweep yields exclusive prefix sums.
if(lid < 2)
{
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
}
// Down-sweep: bi receives bi + ai, ai receives the previous value of bi.
for(int d = 1; d < LSIZE; d <<= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
offset >>= 1;
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Output index of byte column gid * 4 (shifted back by pre_invalid columns) at
// position i + lid - 1; loc_s1 is the same position four columns further on.
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
// Slot lid holds the sum of rows i .. i + lid - 1, so work-item 0 has nothing to write.
if(lid > 0 && (i+lid) <= rows)
{
// Add the totals of all previous chunks.
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
lm_sqsum[0][bf_loc] += sqsum_t[0];
lm_sqsum[1][bf_loc] += sqsum_t[1];
// View the vector as four scalars, one per byte column.
sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc]));
for(int k = 0; k < 4; k++)
{
// Skip columns outside [pre_invalid, cols + pre_invalid).
if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
sum[loc_s0 + k * dst_step / 4] = sum_p[k];
sqsum[loc_s0 + k * dst_step / 4] = sqsum_p[k];
}
sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc]));
for(int k = 0; k < 4; k++)
{
if(gid * 4 + k + 4 >= cols + pre_invalid) break;
sum[loc_s1 + k * dst_step / 4] = sum_p[k];
sqsum[loc_s1 + k * dst_step / 4] = sqsum_p[k];
}
}
// Keep the carry slot intact until every work-item has finished with this chunk.
barrier(CLK_LOCAL_MEM_FENCE);
}
}
// integral_rows_D4: second accumulation pass over int4 / float4 intermediates.
//
// Each work-group handles the vector columns gid * 2 and gid * 2 + 1 of srcsum and
// srcsqsum and accumulates them along the first index (0 .. rows - 1). Component k of
// the first vector is written to output line gid * 8 + k + 1 and component k of the
// second to line gid * 8 + k + 5, both at position i + lid, where a line is
// sum_step / 4 (sqsum_step / 4) elements long. The kernel also zeroes output line 0
// (group 0 only) and position 0 of the lines it owns.
//
//   srcsum, srcsqsum         - inputs viewed as int4 / float4
//   sum, sqsum               - outputs (int / float), indexed by element
//   rows                     - number of entries along the scanned index
//   cols                     - number of output lines (excluding line 0) that are written
//   src_step                 - input pitch, shifted right by 4 before indexing vectors
//                              (presumably given in bytes)
//   sum_step, sqsum_step     - output pitches; step / 4 elements per output line
//   sum_offset, sqsum_offset - element offsets added to every output index
//
// NOTE(review): the lid >> 7 / lid & 127 indexing presumes LSIZE (256) work-items per
// group -- confirm against the host launch configuration.
kernel void integral_rows_D4(__global int4 *srcsum,__global float4 * srcsqsum,__global int *sum ,
__global float *sqsum,int rows,int cols,int src_step,int sum_step,
int sqsum_step,int sum_offset,int sqsum_offset)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
int4 src_t[2], sum_t[2];
float4 sqsrc_t[2],sqsum_t[2];
__local int4 lm_sum[2][LSIZE + LOG_LSIZE];
__local float4 lm_sqsum[2][LSIZE + LOG_LSIZE];
__local int *sum_p;
__local float *sqsum_p;
// Pitch in 16-byte vector units.
src_step = src_step >> 4;
// Walk the input in chunks of LSIZE_1; work-item lid loads entry i + lid.
for(int i = 0; i < rows; i =i + LSIZE_1)
{
// Entries past the end contribute zero.
src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (int4)0;
sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : (float4)0;
src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (int4)0;
sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0;
// Carry from the previous chunk: the running totals left in the last scan slot.
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
// Everybody must have read the carry before the local arrays are overwritten.
barrier(CLK_LOCAL_MEM_FENCE);
// Padded slot of this work-item in the local arrays.
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
lm_sum[0][bf_loc] = src_t[0];
lm_sqsum[0][bf_loc] = sqsrc_t[0];
lm_sum[1][bf_loc] = src_t[1];
lm_sqsum[1][bf_loc] = sqsrc_t[1];
int offset = 1;
// Up-sweep: build partial sums in place. lid >> 7 selects the array (column),
// lid & 127 the pair of slots to combine at this level.
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
{
barrier(CLK_LOCAL_MEM_FENCE);
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
}
offset <<= 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
// Clear the last slot of both arrays so the down-sweep yields exclusive prefix sums.
if(lid < 2)
{
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
}
// Down-sweep: bi receives bi + ai, ai receives the previous value of bi.
for(int d = 1; d < LSIZE; d <<= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
offset >>= 1;
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Group 0 zeroes output line 0 (positions 0 .. rows).
if(gid == 0 && (i + lid) <= rows)
{
sum[sum_offset + i + lid] = 0;
sqsum[sqsum_offset + i + lid] = 0;
}
// One work-item per group zeroes position 0 of the (up to eight) lines it owns.
if(i + lid == 0)
{
int loc0 = gid * 2 * sum_step;
int loc1 = gid * 2 * sqsum_step;
for(int k = 1; k <= 8; k++)
{
if(gid * 8 + k > cols) break;
sum[sum_offset + loc0 + k * sum_step / 4] = 0;
sqsum[sqsum_offset + loc1 + k * sqsum_step / 4] = 0;
}
}
// Output index of line gid * 8 + 1 at position i + lid; loc_s1 / loc_sq1 are four
// lines further on.
int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ;
// Slot lid holds the sum of entries i .. i + lid - 1, so work-item 0 has nothing to write.
if(lid > 0 && (i+lid) <= rows)
{
// Add the totals of all previous chunks.
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
lm_sqsum[0][bf_loc] += sqsum_t[0];
lm_sqsum[1][bf_loc] += sqsum_t[1];
// View the vector as four scalars, one per output line.
sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc]));
for(int k = 0; k < 4; k++)
{
if(gid * 8 + k >= cols) break;
sum[loc_s0 + k * sum_step / 4] = sum_p[k];
sqsum[loc_sq0 + k * sqsum_step / 4] = sqsum_p[k];
}
sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc]));
for(int k = 0; k < 4; k++)
{
if(gid * 8 + 4 + k >= cols) break;
sum[loc_s1 + k * sum_step / 4] = sum_p[k];
sqsum[loc_sq1 + k * sqsum_step / 4] = sqsum_p[k];
}
}
// Keep the carry slot intact until every work-item has finished with this chunk.
barrier(CLK_LOCAL_MEM_FENCE);
}
}
// integral_cols_D5: running sums down the rows of an 8-bit image, float output.
//
// Each work-group handles two adjacent uchar4 elements of every source row (eight
// byte-wide columns) and accumulates, row after row, the float sum and the float sum
// of squares of each column. The running sum that includes source row r is written
// for byte column c at index (c - pre_invalid) * dst_step / 4 + r, so one source
// column becomes one contiguous run of the output.
//
//   src         - source viewed as uchar4; src_offset is an index in uchar4 units
//   sum, sqsum  - float outputs, indexed by element
//   pre_invalid - number of leading byte columns that are skipped
//   rows        - number of source rows
//   cols        - column count used in the bounds tests (cols + pre_invalid)
//   src_step    - source pitch, shifted right by 2 before indexing uchar4 elements
//                 (presumably given in bytes)
//   dst_step    - output pitch; dst_step / 4 elements separate consecutive output runs
//
// NOTE(review): the lid >> 7 / lid & 127 indexing presumes LSIZE (256) work-items per
// group -- confirm against the host launch configuration.
kernel void integral_cols_D5(__global uchar4 *src,__global float *sum ,__global float *sqsum,
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
float4 src_t[2], sum_t[2];
float4 sqsum_t[2];
__local float4 lm_sum[2][LSIZE + LOG_LSIZE];
__local float4 lm_sqsum[2][LSIZE + LOG_LSIZE];
__local float* sum_p;
__local float* sqsum_p;
// Pitch in uchar4 units.
src_step = src_step >> 2;
// This group owns the uchar4 columns gid and gid + 1.
gid = gid << 1;
// Walk the rows in chunks of LSIZE_1; work-item lid loads row i + lid.
for(int i = 0; i < rows; i =i + LSIZE_1)
{
// Rows past the end contribute zero; the column index is clamped to cols - 1.
src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : (float4)0);
src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : (float4)0);
// Carry from the previous chunk: the running totals left in the last scan slot.
sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
// Everybody must have read the carry before the local arrays are overwritten.
barrier(CLK_LOCAL_MEM_FENCE);
// Padded slot of this work-item in the local arrays.
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
lm_sum[0][bf_loc] = src_t[0];
lm_sqsum[0][bf_loc] = convert_float4(src_t[0] * src_t[0]);
lm_sum[1][bf_loc] = src_t[1];
lm_sqsum[1][bf_loc] = convert_float4(src_t[1] * src_t[1]);
int offset = 1;
// Up-sweep: build partial sums in place. lid >> 7 selects the array (column),
// lid & 127 the pair of slots to combine at this level.
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
{
barrier(CLK_LOCAL_MEM_FENCE);
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
}
offset <<= 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
// Clear the last slot of both arrays so the down-sweep yields exclusive prefix sums.
if(lid < 2)
{
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
}
// Down-sweep: bi receives bi + ai, ai receives the previous value of bi.
for(int d = 1; d < LSIZE; d <<= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
offset >>= 1;
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Output index of byte column gid * 4 (shifted back by pre_invalid columns) at
// position i + lid - 1; loc_s1 is the same position four columns further on.
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
// Slot lid holds the sum of rows i .. i + lid - 1, so work-item 0 has nothing to write.
if(lid > 0 && (i+lid) <= rows)
{
// Add the totals of all previous chunks.
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
lm_sqsum[0][bf_loc] += sqsum_t[0];
lm_sqsum[1][bf_loc] += sqsum_t[1];
// View the vector as four scalars, one per byte column.
sum_p = (__local float*)(&(lm_sum[0][bf_loc]));
sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc]));
for(int k = 0; k < 4; k++)
{
// Skip columns outside [pre_invalid, cols + pre_invalid).
if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
sum[loc_s0 + k * dst_step / 4] = sum_p[k];
sqsum[loc_s0 + k * dst_step / 4] = sqsum_p[k];
}
sum_p = (__local float*)(&(lm_sum[1][bf_loc]));
sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc]));
for(int k = 0; k < 4; k++)
{
if(gid * 4 + k + 4 >= cols + pre_invalid) break;
sum[loc_s1 + k * dst_step / 4] = sum_p[k];
sqsum[loc_s1 + k * dst_step / 4] = sqsum_p[k];
}
}
// Keep the carry slot intact until every work-item has finished with this chunk.
barrier(CLK_LOCAL_MEM_FENCE);
}
}
// integral_rows_D5: second accumulation pass over float4 intermediates.
//
// Each work-group handles the vector columns gid * 2 and gid * 2 + 1 of srcsum and
// srcsqsum and accumulates them along the first index (0 .. rows - 1). Component k of
// the first vector is written to output line gid * 8 + k + 1 and component k of the
// second to line gid * 8 + k + 5, both at position i + lid, where a line is
// sum_step / 4 (sqsum_step / 4) elements long. The kernel also zeroes output line 0
// (group 0 only) and position 0 of the lines it owns.
//
//   srcsum, srcsqsum         - inputs viewed as float4
//   sum, sqsum               - float outputs, indexed by element
//   rows                     - number of entries along the scanned index
//   cols                     - number of output lines (excluding line 0) that are written
//   src_step                 - input pitch, shifted right by 4 before indexing vectors
//                              (presumably given in bytes)
//   sum_step, sqsum_step     - output pitches; step / 4 elements per output line
//   sum_offset, sqsum_offset - element offsets added to every output index
//
// NOTE(review): the lid >> 7 / lid & 127 indexing presumes LSIZE (256) work-items per
// group -- confirm against the host launch configuration.
kernel void integral_rows_D5(__global float4 *srcsum,__global float4 * srcsqsum,__global float *sum ,
__global float *sqsum,int rows,int cols,int src_step,int sum_step,
int sqsum_step,int sum_offset,int sqsum_offset)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
float4 src_t[2], sum_t[2];
float4 sqsrc_t[2],sqsum_t[2];
__local float4 lm_sum[2][LSIZE + LOG_LSIZE];
__local float4 lm_sqsum[2][LSIZE + LOG_LSIZE];
__local float *sum_p;
__local float *sqsum_p;
// Pitch in 16-byte vector units.
src_step = src_step >> 4;
// Walk the input in chunks of LSIZE_1; work-item lid loads entry i + lid.
for(int i = 0; i < rows; i =i + LSIZE_1)
{
// Entries past the end contribute zero.
src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (float4)0;
sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : (float4)0;
src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0;
sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0;
// Carry from the previous chunk: the running totals left in the last scan slot.
sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
// Everybody must have read the carry before the local arrays are overwritten.
barrier(CLK_LOCAL_MEM_FENCE);
// Padded slot of this work-item in the local arrays.
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
lm_sum[0][bf_loc] = src_t[0];
lm_sqsum[0][bf_loc] = sqsrc_t[0];
lm_sum[1][bf_loc] = src_t[1];
lm_sqsum[1][bf_loc] = sqsrc_t[1];
int offset = 1;
// Up-sweep: build partial sums in place. lid >> 7 selects the array (column),
// lid & 127 the pair of slots to combine at this level.
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
{
barrier(CLK_LOCAL_MEM_FENCE);
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
}
offset <<= 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
// Clear the last slot of both arrays so the down-sweep yields exclusive prefix sums.
if(lid < 2)
{
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
}
// Down-sweep: bi receives bi + ai, ai receives the previous value of bi.
for(int d = 1; d < LSIZE; d <<= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
offset >>= 1;
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Group 0 zeroes output line 0 (positions 0 .. rows).
if(gid == 0 && (i + lid) <= rows)
{
sum[sum_offset + i + lid] = 0;
sqsum[sqsum_offset + i + lid] = 0;
}
// One work-item per group zeroes position 0 of the (up to eight) lines it owns.
if(i + lid == 0)
{
int loc0 = gid * 2 * sum_step;
int loc1 = gid * 2 * sqsum_step;
for(int k = 1; k <= 8; k++)
{
if(gid * 8 + k > cols) break;
sum[sum_offset + loc0 + k * sum_step / 4] = 0;
sqsum[sqsum_offset + loc1 + k * sqsum_step / 4] = 0;
}
}
// Output index of line gid * 8 + 1 at position i + lid; loc_s1 / loc_sq1 are four
// lines further on.
int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ;
// Slot lid holds the sum of entries i .. i + lid - 1, so work-item 0 has nothing to write.
if(lid > 0 && (i+lid) <= rows)
{
// Add the totals of all previous chunks.
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
lm_sqsum[0][bf_loc] += sqsum_t[0];
lm_sqsum[1][bf_loc] += sqsum_t[1];
// View the vector as four scalars, one per output line.
sum_p = (__local float*)(&(lm_sum[0][bf_loc]));
sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc]));
for(int k = 0; k < 4; k++)
{
if(gid * 8 + k >= cols) break;
sum[loc_s0 + k * sum_step / 4] = sum_p[k];
sqsum[loc_sq0 + k * sqsum_step / 4] = sqsum_p[k];
}
sum_p = (__local float*)(&(lm_sum[1][bf_loc]));
sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc]));
for(int k = 0; k < 4; k++)
{
if(gid * 8 + 4 + k >= cols) break;
sum[loc_s1 + k * sum_step / 4] = sum_p[k];
sqsum[loc_sq1 + k * sqsum_step / 4] = sqsum_p[k];
}
}
// Keep the carry slot intact until every work-item has finished with this chunk.
barrier(CLK_LOCAL_MEM_FENCE);
}
}

@ -0,0 +1,412 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Shengen Yan,yanshengen@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
// Number of slots scanned per local array. The kernels below split the work-items
// with lid >> 7 / lid & 127, i.e. 128 work-items per array.
#define LSIZE 256
// Number of source rows consumed per iteration of the kernels' outer loop.
#define LSIZE_1 255
// LSIZE_2 + LOG_LSIZE is the local-array slot from which the kernels read the running
// total carried into the next iteration.
#define LSIZE_2 254
// NOTE(review): not referenced by the kernels in this section.
#define HF_LSIZE 128
// Extra slots appended to each local array (LSIZE + LOG_LSIZE) so that indices padded
// by GET_CONFLICT_OFFSET stay in range.
#define LOG_LSIZE 8
// log2(NUM_BANKS); used as the shift in GET_CONFLICT_OFFSET.
#define LOG_NUM_BANKS 5
#define NUM_BANKS 32
// Padding added to a local-array index: one extra slot for every NUM_BANKS elements
// (presumably to spread accesses across local-memory banks).
#define GET_CONFLICT_OFFSET(lid) ((lid) >> LOG_NUM_BANKS)
// integral_sum_cols_D4: running int sums down the rows of an 8-bit image.
//
// Each work-group handles two adjacent uchar4 elements of every source row (eight
// byte-wide columns) and accumulates each column row after row. The running sum that
// includes source row r is written for byte column c at index
// (c - pre_invalid) * dst_step / 4 + r, so one source column becomes one contiguous
// run of the output.
//
//   src         - source viewed as uchar4; src_offset is an index in uchar4 units
//   sum         - int output, indexed by element
//   pre_invalid - number of leading byte columns that are skipped
//   rows        - number of source rows
//   cols        - column count used in the bounds tests (cols + pre_invalid)
//   src_step    - source pitch, shifted right by 2 before indexing uchar4 elements
//                 (presumably given in bytes)
//   dst_step    - output pitch; dst_step / 4 elements separate consecutive output runs
//
// NOTE(review): the lid >> 7 / lid & 127 indexing presumes LSIZE (256) work-items per
// group -- confirm against the host launch configuration.
// NOTE(review): unlike the integral_cols_D4 kernel, the column index used for the
// loads is not clamped with min(..., cols - 1) -- confirm that gid + 1 can never
// address an element outside the source buffer.
kernel void integral_sum_cols_D4(__global uchar4 *src,__global int *sum ,
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
int4 src_t[2], sum_t[2];
__local int4 lm_sum[2][LSIZE + LOG_LSIZE];
__local int* sum_p;
// Pitch in uchar4 units.
src_step = src_step >> 2;
// This group owns the uchar4 columns gid and gid + 1.
gid = gid << 1;
// Walk the rows in chunks of LSIZE_1; work-item lid loads row i + lid.
for(int i = 0; i < rows; i =i + LSIZE_1)
{
// Rows past the end contribute zero.
src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid]) : 0);
src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid + 1]) : 0);
// Carry from the previous chunk: the running totals left in the last scan slot.
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
// Everybody must have read the carry before the local arrays are overwritten.
barrier(CLK_LOCAL_MEM_FENCE);
// Padded slot of this work-item in the local arrays.
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
lm_sum[0][bf_loc] = src_t[0];
lm_sum[1][bf_loc] = src_t[1];
int offset = 1;
// Up-sweep: build partial sums in place. lid >> 7 selects the array (column),
// lid & 127 the pair of slots to combine at this level.
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
{
barrier(CLK_LOCAL_MEM_FENCE);
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
}
offset <<= 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
// Clear the last slot of both arrays so the down-sweep yields exclusive prefix sums.
if(lid < 2)
{
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
}
// Down-sweep: bi receives bi + ai, ai receives the previous value of bi.
for(int d = 1; d < LSIZE; d <<= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
offset >>= 1;
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Slot lid holds the sum of rows i .. i + lid - 1, so work-item 0 has nothing to write.
if(lid > 0 && (i+lid) <= rows)
{
// Output index of byte column gid * 4 (shifted back by pre_invalid columns) at
// position i + lid - 1; loc_s1 is the same position four columns further on.
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
// Add the totals of all previous chunks.
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
// View the vector as four scalars, one per byte column.
sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
for(int k = 0; k < 4; k++)
{
// Skip columns outside [pre_invalid, cols + pre_invalid).
if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
sum[loc_s0 + k * dst_step / 4] = sum_p[k];
}
sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
for(int k = 0; k < 4; k++)
{
if(gid * 4 + k + 4 >= cols + pre_invalid) break;
sum[loc_s1 + k * dst_step / 4] = sum_p[k];
}
}
// Keep the carry slot intact until every work-item has finished with this chunk.
barrier(CLK_LOCAL_MEM_FENCE);
}
}
// integral_sum_rows_D4: second accumulation pass over int4 intermediates.
//
// Each work-group handles the vector columns gid * 2 and gid * 2 + 1 of srcsum and
// accumulates them along the first index (0 .. rows - 1). Component k of the first
// vector is written to output line gid * 8 + k + 1 and component k of the second to
// line gid * 8 + k + 5, both at position i + lid, where a line is sum_step / 4
// elements long. The kernel also zeroes output line 0 (group 0 only) and position 0
// of the lines it owns.
//
//   srcsum     - input viewed as int4
//   sum        - int output, indexed by element
//   rows       - number of entries along the scanned index
//   cols       - number of output lines (excluding line 0) that are written
//   src_step   - input pitch, shifted right by 4 before indexing vectors
//                (presumably given in bytes)
//   sum_step   - output pitch; sum_step / 4 elements per output line
//   sum_offset - element offset added to every output index
//
// NOTE(review): the lid >> 7 / lid & 127 indexing presumes LSIZE (256) work-items per
// group -- confirm against the host launch configuration.
kernel void integral_sum_rows_D4(__global int4 *srcsum,__global int *sum ,
int rows,int cols,int src_step,int sum_step,
int sum_offset)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
int4 src_t[2], sum_t[2];
__local int4 lm_sum[2][LSIZE + LOG_LSIZE];
__local int *sum_p;
// Pitch in 16-byte vector units.
src_step = src_step >> 4;
// Walk the input in chunks of LSIZE_1; work-item lid loads entry i + lid.
for(int i = 0; i < rows; i =i + LSIZE_1)
{
// Entries past the end contribute zero.
src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : 0;
src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : 0;
// Carry from the previous chunk: the running totals left in the last scan slot.
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
// Everybody must have read the carry before the local arrays are overwritten.
barrier(CLK_LOCAL_MEM_FENCE);
// Padded slot of this work-item in the local arrays.
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
lm_sum[0][bf_loc] = src_t[0];
lm_sum[1][bf_loc] = src_t[1];
int offset = 1;
// Up-sweep: build partial sums in place. lid >> 7 selects the array (column),
// lid & 127 the pair of slots to combine at this level.
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
{
barrier(CLK_LOCAL_MEM_FENCE);
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
}
offset <<= 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
// Clear the last slot of both arrays so the down-sweep yields exclusive prefix sums.
if(lid < 2)
{
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
}
// Down-sweep: bi receives bi + ai, ai receives the previous value of bi.
for(int d = 1; d < LSIZE; d <<= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
offset >>= 1;
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Group 0 zeroes output line 0 (positions 0 .. rows).
if(gid == 0 && (i + lid) <= rows)
{
sum[sum_offset + i + lid] = 0;
}
// One work-item per group zeroes position 0 of the (up to eight) lines it owns.
if(i + lid == 0)
{
int loc0 = gid * 2 * sum_step;
for(int k = 1; k <= 8; k++)
{
if(gid * 8 + k > cols) break;
sum[sum_offset + loc0 + k * sum_step / 4] = 0;
}
}
// Slot lid holds the sum of entries i .. i + lid - 1, so work-item 0 has nothing to write.
if(lid > 0 && (i+lid) <= rows)
{
// Output index of line gid * 8 + 1 at position i + lid; loc_s1 is four lines further on.
int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
// Add the totals of all previous chunks.
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
// View the vector as four scalars, one per output line.
sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
for(int k = 0; k < 4; k++)
{
if(gid * 8 + k >= cols) break;
sum[loc_s0 + k * sum_step / 4] = sum_p[k];
}
sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
for(int k = 0; k < 4; k++)
{
if(gid * 8 + 4 + k >= cols) break;
sum[loc_s1 + k * sum_step / 4] = sum_p[k];
}
}
// Keep the carry slot intact until every work-item has finished with this chunk.
barrier(CLK_LOCAL_MEM_FENCE);
}
}
// integral_sum_cols_D5: running float sums down the rows of an 8-bit image.
//
// Each work-group handles two adjacent uchar4 elements of every source row (eight
// byte-wide columns) and accumulates each column row after row. The running sum that
// includes source row r is written for byte column c at index
// (c - pre_invalid) * dst_step / 4 + r, so one source column becomes one contiguous
// run of the output.
//
//   src         - source viewed as uchar4; src_offset is an index in uchar4 units
//   sum         - float output, indexed by element
//   pre_invalid - number of leading byte columns that are skipped
//   rows        - number of source rows
//   cols        - column count used in the bounds tests (cols + pre_invalid)
//   src_step    - source pitch, shifted right by 2 before indexing uchar4 elements
//                 (presumably given in bytes)
//   dst_step    - output pitch; dst_step / 4 elements separate consecutive output runs
//
// NOTE(review): the lid >> 7 / lid & 127 indexing presumes LSIZE (256) work-items per
// group -- confirm against the host launch configuration.
// NOTE(review): unlike the integral_cols_D5 kernel, the column index used for the
// loads is not clamped with min(..., cols - 1) -- confirm that gid + 1 can never
// address an element outside the source buffer.
kernel void integral_sum_cols_D5(__global uchar4 *src,__global float *sum ,
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
float4 src_t[2], sum_t[2];
__local float4 lm_sum[2][LSIZE + LOG_LSIZE];
__local float* sum_p;
// Pitch in uchar4 units.
src_step = src_step >> 2;
// This group owns the uchar4 columns gid and gid + 1.
gid = gid << 1;
// Walk the rows in chunks of LSIZE_1; work-item lid loads row i + lid.
for(int i = 0; i < rows; i =i + LSIZE_1)
{
// Rows past the end contribute zero.
src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + gid]) : (float4)0);
src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + gid + 1]) : (float4)0);
// Carry from the previous chunk: the running totals left in the last scan slot.
sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
// Everybody must have read the carry before the local arrays are overwritten.
barrier(CLK_LOCAL_MEM_FENCE);
// Padded slot of this work-item in the local arrays.
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
lm_sum[0][bf_loc] = src_t[0];
lm_sum[1][bf_loc] = src_t[1];
int offset = 1;
// Up-sweep: build partial sums in place. lid >> 7 selects the array (column),
// lid & 127 the pair of slots to combine at this level.
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
{
barrier(CLK_LOCAL_MEM_FENCE);
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
}
offset <<= 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
// Clear the last slot of both arrays so the down-sweep yields exclusive prefix sums.
if(lid < 2)
{
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
}
// Down-sweep: bi receives bi + ai, ai receives the previous value of bi.
for(int d = 1; d < LSIZE; d <<= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
offset >>= 1;
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Slot lid holds the sum of rows i .. i + lid - 1, so work-item 0 has nothing to write.
if(lid > 0 && (i+lid) <= rows)
{
// Output index of byte column gid * 4 (shifted back by pre_invalid columns) at
// position i + lid - 1; loc_s1 is the same position four columns further on.
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
// Add the totals of all previous chunks.
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
// View the vector as four scalars, one per byte column.
sum_p = (__local float*)(&(lm_sum[0][bf_loc]));
for(int k = 0; k < 4; k++)
{
// Skip columns outside [pre_invalid, cols + pre_invalid).
if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
sum[loc_s0 + k * dst_step / 4] = sum_p[k];
}
sum_p = (__local float*)(&(lm_sum[1][bf_loc]));
for(int k = 0; k < 4; k++)
{
if(gid * 4 + k + 4 >= cols + pre_invalid) break;
sum[loc_s1 + k * dst_step / 4] = sum_p[k];
}
}
// Keep the carry slot intact until every work-item has finished with this chunk.
barrier(CLK_LOCAL_MEM_FENCE);
}
}
// integral_sum_rows_D5: second accumulation pass over float4 intermediates.
//
// Each work-group handles the vector columns gid * 2 and gid * 2 + 1 of srcsum and
// accumulates them along the first index (0 .. rows - 1). Component k of the first
// vector is written to output line gid * 8 + k + 1 and component k of the second to
// line gid * 8 + k + 5, both at position i + lid, where a line is sum_step / 4
// elements long. The kernel also zeroes output line 0 (group 0 only) and position 0
// of the lines it owns.
//
//   srcsum     - input viewed as float4
//   sum        - float output, indexed by element
//   rows       - number of entries along the scanned index
//   cols       - number of output lines (excluding line 0) that are written
//   src_step   - input pitch, shifted right by 4 before indexing vectors
//                (presumably given in bytes)
//   sum_step   - output pitch; sum_step / 4 elements per output line
//   sum_offset - element offset added to every output index
//
// NOTE(review): the lid >> 7 / lid & 127 indexing presumes LSIZE (256) work-items per
// group -- confirm against the host launch configuration.
kernel void integral_sum_rows_D5(__global float4 *srcsum,__global float *sum ,
int rows,int cols,int src_step,int sum_step,
int sum_offset)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
float4 src_t[2], sum_t[2];
__local float4 lm_sum[2][LSIZE + LOG_LSIZE];
__local float *sum_p;
// Pitch in 16-byte vector units.
src_step = src_step >> 4;
// Walk the input in chunks of LSIZE_1; work-item lid loads entry i + lid.
for(int i = 0; i < rows; i =i + LSIZE_1)
{
// Entries past the end contribute zero.
src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (float4)0;
src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0;
// Carry from the previous chunk: the running totals left in the last scan slot.
sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
// Everybody must have read the carry before the local arrays are overwritten.
barrier(CLK_LOCAL_MEM_FENCE);
// Padded slot of this work-item in the local arrays.
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
lm_sum[0][bf_loc] = src_t[0];
lm_sum[1][bf_loc] = src_t[1];
int offset = 1;
// Up-sweep: build partial sums in place. lid >> 7 selects the array (column),
// lid & 127 the pair of slots to combine at this level.
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
{
barrier(CLK_LOCAL_MEM_FENCE);
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
}
offset <<= 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
// Clear the last slot of both arrays so the down-sweep yields exclusive prefix sums.
if(lid < 2)
{
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
}
// Down-sweep: bi receives bi + ai, ai receives the previous value of bi.
for(int d = 1; d < LSIZE; d <<= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
offset >>= 1;
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Group 0 zeroes output line 0 (positions 0 .. rows).
if(gid == 0 && (i + lid) <= rows)
{
sum[sum_offset + i + lid] = 0;
}
// One work-item per group zeroes position 0 of the (up to eight) lines it owns.
if(i + lid == 0)
{
int loc0 = gid * 2 * sum_step;
for(int k = 1; k <= 8; k++)
{
if(gid * 8 + k > cols) break;
sum[sum_offset + loc0 + k * sum_step / 4] = 0;
}
}
// Slot lid holds the sum of entries i .. i + lid - 1, so work-item 0 has nothing to write.
if(lid > 0 && (i+lid) <= rows)
{
// Output index of line gid * 8 + 1 at position i + lid; loc_s1 is four lines further on.
int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
// Add the totals of all previous chunks.
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
// View the vector as four scalars, one per output line.
sum_p = (__local float*)(&(lm_sum[0][bf_loc]));
for(int k = 0; k < 4; k++)
{
if(gid * 8 + k >= cols) break;
sum[loc_s0 + k * sum_step / 4] = sum_p[k];
}
sum_p = (__local float*)(&(lm_sum[1][bf_loc]));
for(int k = 0; k < 4; k++)
{
if(gid * 8 + 4 + k >= cols) break;
sum[loc_s1 + k * sum_step / 4] = sum_p[k];
}
}
// Keep the carry slot intact until every work-item has finished with this chunk.
barrier(CLK_LOCAL_MEM_FENCE);
}
}

@ -0,0 +1,381 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Pang Erping, erping@multicorewareinc.com
// Jia Haipeng, jiahaipeng95@gmail.com
// Peng Xiao, pengxiao@outlook.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////Macro for border type////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////
// Border-extrapolation address macros. Exactly one BORDER_* symbol is expected
// to be supplied at build time; none of these macros is defined for
// BORDER_CONSTANT, which the kernels handle with explicit bounds tests.
//
//   ADDR_L(i, l_edge, r_edge) - remaps i when it lies left of l_edge, otherwise
//                               yields i (r_edge is accepted but unused).
//   ADDR_R(i, r_edge, addr)   - remaps i when i >= r_edge, otherwise yields addr
//                               (normally the result of the matching ADDR_L).
//   ADDR_H / ADDR_B           - the same pair for the top / bottom edges.
#ifdef BORDER_REPLICATE
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#endif
#ifdef BORDER_REFLECT
//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb (the edge pixel itself is mirrored)
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? ((l_edge)<<1)-(i)-1 : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? ((t_edge)<<1)-(i)-1 : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
#endif
#ifdef BORDER_REFLECT_101
//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? ((l_edge)<<1)-(i) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? ((t_edge)<<1)-(i) : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
#endif
// Pixel-type selection. One IMG_C_<channels>_<depth> symbol is expected at
// build time and fixes:
//   T_IMG          - type of one pixel
//   T_IMGx4        - vector holding four consecutive pixels
//   T_IMG_C1       - scalar channel type (pointer type passed to VLOAD4)
//   CONVERT_TYPE   - conversion from the accumulator type back to T_IMG
//   CONVERT_TYPEx4 - the same for four pixels at once
// The uchar variants convert with saturation (_sat); the float variants do not.
#ifdef IMG_C_1_0
#define T_IMG uchar
#define T_IMGx4 uchar4
#define T_IMG_C1 uchar
#define CONVERT_TYPE convert_uchar_sat
#define CONVERT_TYPEx4 convert_uchar4_sat
#endif
#ifdef IMG_C_4_0
#define T_IMG uchar4
#define T_IMGx4 uchar16
#define T_IMG_C1 uchar
#define CONVERT_TYPE convert_uchar4_sat
#define CONVERT_TYPEx4 convert_uchar16_sat
#endif
#ifdef IMG_C_1_5
#define T_IMG float
#define T_IMGx4 float4
#define T_IMG_C1 float
#define CONVERT_TYPE convert_float
#define CONVERT_TYPEx4 convert_float4
#endif
#ifdef IMG_C_4_5
#define T_IMG float4
#define T_IMGx4 float16
#define T_IMG_C1 float
#define CONVERT_TYPE convert_float4
#define CONVERT_TYPEx4 convert_float16
#endif
// Channel count; defaults to single channel when the build does not set it.
#ifndef CN
#define CN 1
#endif
// Accumulator types: sums are always kept in float, one lane per channel.
//   T_SUM / T_SUMx4   - accumulator for one pixel / four pixels
//   SUM_ZERO(x4)      - initialiser lists used as (T_SUM)(SUM_ZERO) and
//                       (T_SUMx4)SUM_ZEROx4 by the kernels
//   VLOAD4            - vector load that fetches four pixels worth of channels
//   SX, SY, SZ, SW    - swizzles selecting pixel 0..3 inside a x4 vector
// NOTE(review): no branch exists for CN values other than 1 and 4, so these
// names stay undefined in that case.
#if CN == 1
#define T_SUM float
#define T_SUMx4 float4
#define CONVERT_TYPE_SUM convert_float
#define CONVERT_TYPE_SUMx4 convert_float4
#define SUM_ZERO (0.0f)
#define SUM_ZEROx4 (0.0f, 0.0f, 0.0f, 0.0f)
#define VLOAD4 vload4
#define SX x
#define SY y
#define SZ z
#define SW w
#elif CN == 4
#define T_SUM float4
#define T_SUMx4 float16
#define CONVERT_TYPE_SUM convert_float4
#define CONVERT_TYPE_SUMx4 convert_float16
#define SUM_ZERO (0.0f, 0.0f, 0.0f, 0.0f)
#define SUM_ZEROx4 (0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f)
#define VLOAD4 vload16
#define SX s0123
#define SY s4567
#define SZ s89ab
#define SW scdef
#endif
// Filter extent; defaults to 3x3 when the build does not set it.
#ifndef FILTER_SIZE
#define FILTER_SIZE 3
#endif
// Tile geometry of the generic filter2D kernel: a LOCAL_GROUP_SIZE square of
// outputs plus a halo of FILTER_SIZE/2 pixels on every side.
// NOTE(review): LOCAL_GROUP_SIZE presumably has to match the work-group size
// chosen by the host - confirm.
#define LOCAL_GROUP_SIZE 16
#define LOCAL_WIDTH ((FILTER_SIZE/2)*2 + LOCAL_GROUP_SIZE)
#define LOCAL_HEIGHT ((FILTER_SIZE/2)*2 + LOCAL_GROUP_SIZE)
#define FILTER_RADIUS (FILTER_SIZE >> 1)
// Generic FILTER_SIZE x FILTER_SIZE correlation.
//
// Every work-group first stages a LOCAL_WIDTH x LOCAL_HEIGHT tile of `src`
// (its output area plus a FILTER_RADIUS halo) in `local_data`, then each
// work-item accumulates one output pixel from that tile.
//
//   src, dst               - source / destination pixels (T_IMG elements)
//   src_step, dst_step     - row pitch, used directly as an element count
//   mat_kernel             - FILTER_SIZE * FILTER_SIZE coefficients, row-major
//   local_data             - scratch tile, LOCAL_WIDTH * LOCAL_HEIGHT elements
//   wholerows, wholecols   - extent used for border handling
//   src_offset_*, dst_offset_* - ROI origin inside src / dst
//   cols, rows             - number of output pixels to produce
//   operate_cols           - unused by this kernel
__kernel void filter2D(
    __global T_IMG *src,
    __global T_IMG *dst,
    int src_step,
    int dst_step,
    __constant float *mat_kernel,
    __local T_IMG *local_data,
    int wholerows,
    int wholecols,
    int src_offset_x,
    int src_offset_y,
    int dst_offset_x,
    int dst_offset_y,
    int cols,
    int rows,
    int operate_cols
)
{
    const int strideX = get_local_size(0);
    const int strideY = get_local_size(1);
    const int lx = get_local_id(0);
    const int ly = get_local_id(1);

    // Top-left output pixel owned by this work-group.
    const int tileCol0 = get_group_id(0) * get_local_size(0);
    const int tileRow0 = get_group_id(1) * get_local_size(1);

    const int outCol = tileCol0 + lx;
    const int outRow = tileRow0 + ly;

    const int srcBase = mad24(src_offset_y, src_step, src_offset_x);
    const int dstBase = mad24(dst_offset_y, dst_step, dst_offset_x);

    // ---- Stage 1: cooperative load of the tile (with halo) ----
#ifdef BORDER_CONSTANT
    for(int ty = ly; ty < LOCAL_HEIGHT; ty += strideY)
    {
        const int y = tileRow0 + ty;
        const int rowInside = (y >= FILTER_RADIUS - src_offset_y) &&
                              ((y - FILTER_RADIUS) < wholerows - src_offset_y);
        for(int tx = lx; tx < LOCAL_WIDTH; tx += strideX)
        {
            const int x = tileCol0 + tx;
            const int colInside = (x >= FILTER_RADIUS - src_offset_x) &&
                                  ((x - FILTER_RADIUS) < wholecols - src_offset_x);
            if(rowInside && colInside)
            {
                local_data[(ty) * LOCAL_WIDTH + tx] =
                    src[(y - FILTER_RADIUS) * src_step + x - FILTER_RADIUS + srcBase];
            }
            else
            {
                // Outside the image: constant (zero) border.
                local_data[(ty) * LOCAL_WIDTH + tx] = 0;
            }
        }
    }
#else
    for(int ty = ly; ty < LOCAL_HEIGHT; ty += strideY)
    {
        // Fold the row index back into the image with the active border rule.
        int y = tileRow0 + ty;
        y = ADDR_H(y, FILTER_RADIUS - src_offset_y, wholerows - src_offset_y);
        y = ADDR_B(y - FILTER_RADIUS, wholerows - src_offset_y, y - FILTER_RADIUS);
        for(int tx = lx; tx < LOCAL_WIDTH; tx += strideX)
        {
            int x = tileCol0 + tx;
            x = ADDR_L(x, FILTER_RADIUS - src_offset_x, wholecols - src_offset_x);
            x = ADDR_R(x - FILTER_RADIUS, wholecols - src_offset_x, x - FILTER_RADIUS);
            if(y < wholerows && x < wholecols)
            {
                local_data[(ty) * LOCAL_WIDTH + tx] = src[(y) * src_step + x + srcBase];
            }
        }
    }
#endif

    // The whole tile must be visible before anyone starts reading it.
    barrier(CLK_LOCAL_MEM_FENCE);

    // ---- Stage 2: one output pixel per work-item ----
    if(outRow < rows && outCol < cols)
    {
        T_SUM acc = (T_SUM)(SUM_ZERO);
        for(int ky = 0; ky < FILTER_SIZE; ky++)
        {
            const int rowBase = (ky + ly) * LOCAL_WIDTH;
            for(int kx = 0; kx < FILTER_SIZE; kx++)
            {
                acc += CONVERT_TYPE_SUM(local_data[rowBase + kx + lx]) * mat_kernel[ky * FILTER_SIZE + kx];
            }
        }
        dst[(outRow)*dst_step + (outCol) + dstBase] = CONVERT_TYPE(acc);
    }
}
/// following is specific for 3x3 kernels
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////Macros defining the number of elements per thread//////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
// Tuning constants of the 3x3 kernel.
// ANX / ANY: halo width in x / y (one pixel on each side for a 3x3 filter).
#define ANX 1
#define ANY 1
// Each work-group produces ROWS_PER_GROUP output rows; *_BITS is log2 of the
// value and is used for shifts.
#define ROWS_PER_GROUP 4
#define ROWS_PER_GROUP_BITS 2
// Source rows staged in local memory: the output rows plus the halo above and below.
#define ROWS_FETCH (ROWS_PER_GROUP + ANY + ANY) //(ROWS_PER_GROUP + anY * 2)
// Work-items cooperating on one output row, and log2 of that count.
#define THREADS_PER_ROW 64
#define THREADS_PER_ROW_BIT 6
// Output pixels written by one work-item, and log2 of that count.
#define ELEMENTS_PER_THREAD 4
#define ELEMENTS_PER_THREAD_BIT 2
// Row pitch of the local tile, in T_IMG elements.
// NOTE(review): per the formula in the trailing comment, 260 corresponds to
// get_local_size(0) == 256 - confirm against the host launch configuration.
#define LOCAL_MEM_STEP 260 //divup((get_local_size(0) + anX * 2), 4) * 4
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
// filter2D specialised for small (3x3) kernels: each work-item produces
// ELEMENTS_PER_THREAD (4) horizontally adjacent output pixels with one vector
// store, and each work-group covers ROWS_PER_GROUP (4) output rows.
//
//   src, dst             - source / destination pixels (T_IMG elements); src is
//                          indexed with whole-image coordinates
//   src_step, dst_step   - row pitch, used directly as an element count
//   mat_kernel           - FILTER_SIZE * FILTER_SIZE coefficients, row-major
//   local_data           - scratch tile, ROWS_FETCH rows of LOCAL_MEM_STEP elements
//   wholerows, wholecols - extent of the whole source image (border handling)
//   src_offset_*, dst_offset_* - ROI origin inside src / dst
//   cols, rows           - size of the output ROI
//   operate_cols         - number of columns (after alignment) that must be processed
//
// NOTE(review): the work-group is assumed to be 1-D in x with
// ROWS_PER_GROUP * THREADS_PER_ROW work-items; lX >> THREADS_PER_ROW_BIT then
// selects the row inside the group - confirm against the host launch code.
__kernel void filter2D_3x3(
    __global T_IMG *src,
    __global T_IMG *dst,
    int src_step,
    int dst_step,
    __constant float *mat_kernel,
    __local T_IMG *local_data,
    int wholerows,
    int wholecols,
    int src_offset_x,
    int src_offset_y,
    int dst_offset_x,
    int dst_offset_y,
    int cols,
    int rows,
    int operate_cols
)
{
    int gY = get_global_id(1);
    int lX = get_local_id(0);
    int groupX_size = get_local_size(0);
    int groupX_id = get_group_id(0);
    // Misalignment of the destination ROI relative to a 4-pixel boundary; the
    // source window is shifted by the same amount so that vector stores to dst
    // start on a multiple of four pixels.
#define dst_align (dst_offset_x & 3)
    int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
    int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;

    // ---- Stage 1: stage ROWS_FETCH source rows (plus halo) in local memory ----
    if((gY << 2) < rows)
    {
        for(int i = 0; i < ROWS_FETCH; ++i)
        {
            if((rows_start_index - src_offset_y) + i < rows + ANY)
            {
#ifdef BORDER_CONSTANT
                int selected_row = rows_start_index + i;
                int selected_cols = cols_start_index_group + lX;
                int con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols;
                // Only dereference src when the coordinate lies inside the
                // image; everything else is the constant (zero) border.
                T_IMG data = (T_IMG)(0);
                if(con)
                {
                    data = src[mad24(selected_row, src_step, selected_cols)];
                }
                local_data[mad24(i, LOCAL_MEM_STEP, lX)] = data;
                // The first 2 * ANX work-items also fetch the right-hand halo.
                if(lX < (ANX << 1))
                {
                    selected_cols = cols_start_index_group + lX + groupX_size;
                    con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols;
                    data = (T_IMG)(0);
                    if(con)
                    {
                        data = src[mad24(selected_row, src_step, selected_cols)];
                    }
                    local_data[mad24(i, LOCAL_MEM_STEP, lX) + groupX_size] = data;
                }
#else
                // Fold out-of-image coordinates back inside with the active border rule.
                int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
                selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);
                int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols);
                selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);
                T_IMG data = src[mad24(selected_row, src_step, selected_cols)];
                local_data[mad24(i, LOCAL_MEM_STEP, lX)] = data;
                // The first 2 * ANX work-items also fetch the right-hand halo.
                if(lX < (ANX << 1))
                {
                    selected_cols = cols_start_index_group + lX + groupX_size;
                    selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);
                    data = src[mad24(selected_row, src_step, selected_cols)];
                    local_data[mad24(i, LOCAL_MEM_STEP, lX) + groupX_size] = data;
                }
#endif
            }
        }
    }
    // The tile must be complete before any work-item reads it.
    barrier(CLK_LOCAL_MEM_FENCE);

    // ---- Stage 2: four output pixels per work-item ----
    int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2);
    if(((gY << 2) < rows) && (process_col < operate_cols))
    {
        int dst_cols_start = dst_offset_x;
        int dst_cols_end = dst_offset_x + cols;
        // Round down to a multiple of four pixels for the vector store.
        int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;
        int dst_rows_end = dst_offset_y + rows;
        int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT);
        // Rows past the end of the ROI are neither read nor written.
        if(dst_rows_index < dst_rows_end)
        {
            dst = dst + mad24(dst_rows_index, dst_step, dst_cols_index);
            // Existing destination pixels, kept for lanes outside the ROI.
            T_IMGx4 dst_data = *(__global T_IMGx4 *)dst;
            T_SUMx4 sum = (T_SUMx4)SUM_ZEROx4;
            T_IMGx4 data;
            for(int i = 0; i < FILTER_SIZE; i++)
            {
#pragma unroll
                for(int j = 0; j < FILTER_SIZE; j++)
                {
                    int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
                    int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
                    data = VLOAD4(0, (__local T_IMG_C1 *)(local_data + local_row * LOCAL_MEM_STEP + local_cols));
                    sum = sum + (mat_kernel[i * FILTER_SIZE + j] * CONVERT_TYPE_SUMx4(data));
                }
            }
            T_IMGx4 tmp_dst = CONVERT_TYPEx4(sum);
            // Per-lane merge: only columns inside [dst_cols_start, dst_cols_end)
            // receive the filtered value.
            tmp_dst.SX = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end)) ?
                         tmp_dst.SX : dst_data.SX;
            tmp_dst.SY = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ?
                         tmp_dst.SY : dst_data.SY;
            tmp_dst.SZ = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ?
                         tmp_dst.SZ : dst_data.SZ;
            tmp_dst.SW = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ?
                         tmp_dst.SW : dst_data.SW;
            *(__global T_IMGx4 *)dst = tmp_dst;
        }
    }
}

@ -0,0 +1,857 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
// NOTE(review): cl_amd_printf is a vendor extension and is enabled
// unconditionally here; confirm it is still needed and accepted on non-AMD
// platforms.
#pragma OPENCL EXTENSION cl_amd_printf : enable
// Element type of the squared-sum integral image: double when the build asks
// for DOUBLE_SUPPORT (enabling whichever fp64 extension is available),
// float otherwise.
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#define TYPE_IMAGE_SQSUM double
#else
#define TYPE_IMAGE_SQSUM float
#endif
// CN4 is the channel multiplier applied to squared-sum columns: 4 when the
// build defines CN4 (with any value), 1 otherwise. The #undef avoids
// redefining an already-defined macro with a different replacement list.
#ifndef CN4
#define CN4 1
#else
#undef CN4
#define CN4 4
#endif
//////////////////////////////////////////////////
// utilities
// Index helpers for the integral images. They read gidx, gidy and the
// img_*_step / img_*_offset variables of the enclosing kernel by name, so
// those identifiers must exist (already converted to element units) at the
// point of use. (ox, oy) is the offset from the current result position.
#define SQSUMS_PTR(ox, oy) mad24(gidy + (oy), img_sqsums_step, (gidx + img_sqsums_offset + (ox)) * CN4)
#define SUMS_PTR(ox, oy) mad24(gidy + (oy), img_sums_step, gidx + img_sums_offset + (ox))
// normAcc* are accurate normalization routines which make GPU matchTemplate
// consistent with CPU one
// Guarded division num / denum.
//   |num| <  denum          -> num / denum
//   |num| <  denum * 1.125  -> +1 or -1 (sign of num; -1 when num is not positive)
//   otherwise               -> 0
float normAcc(float num, float denum)
{
    const float magnitude = fabs(num);
    float result = 0;

    if(magnitude < denum)
    {
        result = num / denum;
    }
    else if(magnitude < denum * 1.125f)
    {
        // Slightly over the limit: clamp to the nearest valid extreme.
        result = (num > 0) ? 1 : -1;
    }
    return result;
}
// Guarded division for the normalised squared-difference score.
//   |num| <  denum          -> num / denum
//   |num| <  denum * 1.125  -> +1 or -1 (sign of num; -1 when num is not positive)
//   otherwise               -> 1
float normAcc_SQDIFF(float num, float denum)
{
    const float magnitude = fabs(num);
    float result = 1;

    if(magnitude < denum)
    {
        result = num / denum;
    }
    else if(magnitude < denum * 1.125f)
    {
        // Slightly over the limit: clamp to the nearest valid extreme.
        result = (num > 0) ? 1 : -1;
    }
    return result;
}
//////////////////////////////////////////////////////////////////////
// normalize
// Normalises an existing result map in place:
//   res = normAcc(res, sqrt(window_sqsum * tpl_sqsum))
// where window_sqsum is the sum over the tpl_cols x tpl_rows window taken from
// the squared-sum integral image img_sqsums.
// The *_step / *_offset arguments are divided by the element size before use.
__kernel
void normalizeKernel_C1_D0
(
    __global const float * img_sqsums,
    __global float * res,
    ulong tpl_sqsum,
    int res_rows,
    int res_cols,
    int tpl_rows,
    int tpl_cols,
    int img_sqsums_offset,
    int img_sqsums_step,
    int res_offset,
    int res_step
)
{
    // gidx / gidy and img_sqsums_* keep their names: SQSUMS_PTR reads them.
    int gidx = get_global_id(0);
    int gidy = get_global_id(1);

    img_sqsums_step /= sizeof(*img_sqsums);
    img_sqsums_offset /= sizeof(*img_sqsums);
    res_step /= sizeof(*res);
    res_offset /= sizeof(*res);

    if(gidx >= res_cols || gidy >= res_rows)
    {
        return;
    }

    // Four corners of the window in the integral image.
    const float corner_br = img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)];
    const float corner_tr = img_sqsums[SQSUMS_PTR(tpl_cols, 0)];
    const float corner_bl = img_sqsums[SQSUMS_PTR(0, tpl_rows)];
    const float corner_tl = img_sqsums[SQSUMS_PTR(0, 0)];
    float window_sqsum = (float)((corner_br - corner_tr) - (corner_bl - corner_tl));

    __global float * out = res + mad24(gidy, res_step, res_offset + gidx);
    *out = normAcc(*out, sqrt(window_sqsum * tpl_sqsum));
}
// Turns a cross-correlation map into a squared-difference map in place:
//   res = window_sqsum - 2 * res + tpl_sqsum
// where window_sqsum is the sum over the tpl_cols x tpl_rows window taken from
// the squared-sum integral image img_sqsums.
// The *_step / *_offset arguments are divided by the element size before use.
__kernel
void matchTemplate_Prepared_SQDIFF_C1_D0
(
    __global const TYPE_IMAGE_SQSUM * img_sqsums,
    __global float * res,
    ulong tpl_sqsum,
    int res_rows,
    int res_cols,
    int tpl_rows,
    int tpl_cols,
    int img_sqsums_offset,
    int img_sqsums_step,
    int res_offset,
    int res_step
)
{
    // gidx / gidy and img_sqsums_* keep their names: SQSUMS_PTR reads them.
    int gidx = get_global_id(0);
    int gidy = get_global_id(1);

    img_sqsums_step /= sizeof(*img_sqsums);
    img_sqsums_offset /= sizeof(*img_sqsums);
    res_step /= sizeof(*res);
    res_offset /= sizeof(*res);

    if(gidx >= res_cols || gidy >= res_rows)
    {
        return;
    }

    // Four corners of the window in the integral image.
    const TYPE_IMAGE_SQSUM corner_br = img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)];
    const TYPE_IMAGE_SQSUM corner_tr = img_sqsums[SQSUMS_PTR(tpl_cols, 0)];
    const TYPE_IMAGE_SQSUM corner_bl = img_sqsums[SQSUMS_PTR(0, tpl_rows)];
    const TYPE_IMAGE_SQSUM corner_tl = img_sqsums[SQSUMS_PTR(0, 0)];
    float window_sqsum = (float)((corner_br - corner_tr) - (corner_bl - corner_tl));

    const int out_idx = mad24(gidy, res_step, res_offset + gidx);
    res[out_idx] = window_sqsum - 2.f * res[out_idx] + tpl_sqsum;
}
// Turns a cross-correlation map into a normalised squared-difference map in place:
//   res = normAcc_SQDIFF(window_sqsum - 2 * res + tpl_sqsum,
//                        sqrt(window_sqsum * tpl_sqsum))
// where window_sqsum is the sum over the tpl_cols x tpl_rows window taken from
// the squared-sum integral image img_sqsums.
// The *_step / *_offset arguments are divided by the element size before use.
__kernel
void matchTemplate_Prepared_SQDIFF_NORMED_C1_D0
(
    __global const float * img_sqsums,
    __global float * res,
    ulong tpl_sqsum,
    int res_rows,
    int res_cols,
    int tpl_rows,
    int tpl_cols,
    int img_sqsums_offset,
    int img_sqsums_step,
    int res_offset,
    int res_step
)
{
    // gidx / gidy and img_sqsums_* keep their names: SQSUMS_PTR reads them.
    int gidx = get_global_id(0);
    int gidy = get_global_id(1);

    img_sqsums_step /= sizeof(*img_sqsums);
    img_sqsums_offset /= sizeof(*img_sqsums);
    res_step /= sizeof(*res);
    res_offset /= sizeof(*res);

    if(gidx >= res_cols || gidy >= res_rows)
    {
        return;
    }

    // Four corners of the window in the integral image.
    const float corner_br = img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)];
    const float corner_tr = img_sqsums[SQSUMS_PTR(tpl_cols, 0)];
    const float corner_bl = img_sqsums[SQSUMS_PTR(0, tpl_rows)];
    const float corner_tl = img_sqsums[SQSUMS_PTR(0, 0)];
    float window_sqsum = (float)((corner_br - corner_tr) - (corner_bl - corner_tl));

    const int out_idx = mad24(gidy, res_step, res_offset + gidx);
    res[out_idx] = normAcc_SQDIFF(window_sqsum - 2.f * res[out_idx] + tpl_sqsum,
                                  sqrt(window_sqsum * tpl_sqsum));
}
//////////////////////////////////////////////////
// SQDIFF
// Brute-force sum of squared differences, single-channel uchar input.
// One work-item computes one result element: the template is laid over the
// image at (x, y) and the squared pixel differences are summed as integers.
// img_step / tpl_step and the img / tpl offsets are used as given;
// res_step / res_offset are divided by sizeof(float) before use.
__kernel
void matchTemplate_Naive_SQDIFF_C1_D0
(
    __global const uchar * img,
    __global const uchar * tpl,
    __global float * res,
    int img_rows,
    int img_cols,
    int tpl_rows,
    int tpl_cols,
    int res_rows,
    int res_cols,
    int img_offset,
    int tpl_offset,
    int res_offset,
    int img_step,
    int tpl_step,
    int res_step
)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    const int res_pitch = res_step / sizeof(*res);
    const int res_base = res_offset / sizeof(*res);

    if(x >= res_cols || y >= res_rows)
    {
        return;
    }

    int acc = 0;
    for(int r = 0; r < tpl_rows; ++r)
    {
        // Matching rows of the image window and of the template.
        __global const uchar * img_row = img + mad24(y + r, img_step, x + img_offset);
        __global const uchar * tpl_row = tpl + mad24(r, tpl_step, tpl_offset);
        for(int c = 0; c < tpl_cols; ++c)
        {
            const int diff = img_row[c] - tpl_row[c];
            acc = mad24(diff, diff, acc);
        }
    }
    res[mad24(y, res_pitch, res_base + x)] = acc;
}
// Brute-force sum of squared differences, single-channel float input.
// One work-item computes one result element: the template is laid over the
// image at (x, y) and the squared pixel differences are accumulated with mad().
// All *_step / *_offset arguments are divided by the element size before use.
__kernel
void matchTemplate_Naive_SQDIFF_C1_D5
(
    __global const float * img,
    __global const float * tpl,
    __global float * res,
    int img_rows,
    int img_cols,
    int tpl_rows,
    int tpl_cols,
    int res_rows,
    int res_cols,
    int img_offset,
    int tpl_offset,
    int res_offset,
    int img_step,
    int tpl_step,
    int res_step
)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    const int img_pitch = img_step / sizeof(*img);
    const int img_base = img_offset / sizeof(*img);
    const int tpl_pitch = tpl_step / sizeof(*tpl);
    const int tpl_base = tpl_offset / sizeof(*tpl);
    const int res_pitch = res_step / sizeof(*res);
    const int res_base = res_offset / sizeof(*res);

    if(x >= res_cols || y >= res_rows)
    {
        return;
    }

    float acc = 0;
    for(int r = 0; r < tpl_rows; ++r)
    {
        // Matching rows of the image window and of the template.
        __global const float * img_row = img + mad24(y + r, img_pitch, x + img_base);
        __global const float * tpl_row = tpl + mad24(r, tpl_pitch, tpl_base);
        for(int c = 0; c < tpl_cols; ++c)
        {
            const float diff = img_row[c] - tpl_row[c];
            acc = mad(diff, diff, acc);
        }
    }
    res[mad24(y, res_pitch, res_base + x)] = acc;
}
// Brute-force sum of squared differences, four-channel uchar input.
// One work-item computes one result element; the per-channel integer sums are
// added together (x + y + z + w) for the final score.
// All *_step / *_offset arguments are divided by the element size before use.
__kernel
void matchTemplate_Naive_SQDIFF_C4_D0
(
    __global const uchar4 * img,
    __global const uchar4 * tpl,
    __global float * res,
    int img_rows,
    int img_cols,
    int tpl_rows,
    int tpl_cols,
    int res_rows,
    int res_cols,
    int img_offset,
    int tpl_offset,
    int res_offset,
    int img_step,
    int tpl_step,
    int res_step
)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    const int img_pitch = img_step / sizeof(*img);
    const int img_base = img_offset / sizeof(*img);
    const int tpl_pitch = tpl_step / sizeof(*tpl);
    const int tpl_base = tpl_offset / sizeof(*tpl);
    const int res_pitch = res_step / sizeof(*res);
    const int res_base = res_offset / sizeof(*res);

    if(x >= res_cols || y >= res_rows)
    {
        return;
    }

    int4 acc = (int4)(0, 0, 0, 0);
    for(int r = 0; r < tpl_rows; ++r)
    {
        // Matching rows of the image window and of the template.
        __global const uchar4 * img_row = img + mad24(y + r, img_pitch, x + img_base);
        __global const uchar4 * tpl_row = tpl + mad24(r, tpl_pitch, tpl_base);
        for(int c = 0; c < tpl_cols; ++c)
        {
            // Widen both operands first: subtracting the uchar4 vectors
            // directly would wrap around in 8 bits.
            const int4 diff = convert_int4(img_row[c]) - convert_int4(tpl_row[c]);
            acc = mad24(diff, diff, acc);
        }
    }
    res[mad24(y, res_pitch, res_base + x)] = acc.x + acc.y + acc.z + acc.w;
}
// Brute-force sum of squared differences, four-channel float input.
// One work-item computes one result element; the per-channel sums are added
// together (x + y + z + w) for the final score.
// All *_step / *_offset arguments are divided by the element size before use.
__kernel
void matchTemplate_Naive_SQDIFF_C4_D5
(
    __global const float4 * img,
    __global const float4 * tpl,
    __global float * res,
    int img_rows,
    int img_cols,
    int tpl_rows,
    int tpl_cols,
    int res_rows,
    int res_cols,
    int img_offset,
    int tpl_offset,
    int res_offset,
    int img_step,
    int tpl_step,
    int res_step
)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    const int img_pitch = img_step / sizeof(*img);
    const int img_base = img_offset / sizeof(*img);
    const int tpl_pitch = tpl_step / sizeof(*tpl);
    const int tpl_base = tpl_offset / sizeof(*tpl);
    const int res_pitch = res_step / sizeof(*res);
    const int res_base = res_offset / sizeof(*res);

    if(x >= res_cols || y >= res_rows)
    {
        return;
    }

    float4 acc = (float4)(0, 0, 0, 0);
    for(int r = 0; r < tpl_rows; ++r)
    {
        // Matching rows of the image window and of the template.
        __global const float4 * img_row = img + mad24(y + r, img_pitch, x + img_base);
        __global const float4 * tpl_row = tpl + mad24(r, tpl_pitch, tpl_base);
        for(int c = 0; c < tpl_cols; ++c)
        {
            // Component-wise difference of the two float4 pixels.
            const float4 diff = img_row[c] - tpl_row[c];
            acc = mad(diff, diff, acc);
        }
    }
    res[mad24(y, res_pitch, res_base + x)] = acc.x + acc.y + acc.z + acc.w;
}
//////////////////////////////////////////////////
// CCORR
// Brute-force cross-correlation, single-channel uchar input.
// One work-item computes one result element: the sum of img * tpl products
// over the template window, accumulated as an integer.
// img_step / tpl_step and the img / tpl offsets are used as given;
// res_step / res_offset are divided by sizeof(float) before use.
__kernel
void matchTemplate_Naive_CCORR_C1_D0
(
    __global const uchar * img,
    __global const uchar * tpl,
    __global float * res,
    int img_rows,
    int img_cols,
    int tpl_rows,
    int tpl_cols,
    int res_rows,
    int res_cols,
    int img_offset,
    int tpl_offset,
    int res_offset,
    int img_step,
    int tpl_step,
    int res_step
)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    const int res_pitch = res_step / sizeof(*res);
    const int res_base = res_offset / sizeof(*res);

    if(x >= res_cols || y >= res_rows)
    {
        return;
    }

    int acc = 0;
    for(int r = 0; r < tpl_rows; ++r)
    {
        // Matching rows of the image window and of the template.
        __global const uchar * img_row = img + mad24(y + r, img_step, x + img_offset);
        __global const uchar * tpl_row = tpl + mad24(r, tpl_step, tpl_offset);
        for(int c = 0; c < tpl_cols; ++c)
        {
            acc = mad24((int)img_row[c], (int)tpl_row[c], acc);
        }
    }
    res[mad24(y, res_pitch, res_base + x)] = (float)acc;
}
// Brute-force cross-correlation, single-channel float input.
// One work-item computes one result element: the sum of img * tpl products
// over the template window, accumulated with mad().
// All *_step / *_offset arguments are divided by the element size before use.
__kernel
void matchTemplate_Naive_CCORR_C1_D5
(
    __global const float * img,
    __global const float * tpl,
    __global float * res,
    int img_rows,
    int img_cols,
    int tpl_rows,
    int tpl_cols,
    int res_rows,
    int res_cols,
    int img_offset,
    int tpl_offset,
    int res_offset,
    int img_step,
    int tpl_step,
    int res_step
)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    const int img_pitch = img_step / sizeof(*img);
    const int img_base = img_offset / sizeof(*img);
    const int tpl_pitch = tpl_step / sizeof(*tpl);
    const int tpl_base = tpl_offset / sizeof(*tpl);
    const int res_pitch = res_step / sizeof(*res);
    const int res_base = res_offset / sizeof(*res);

    if(x >= res_cols || y >= res_rows)
    {
        return;
    }

    float acc = 0;
    for(int r = 0; r < tpl_rows; ++r)
    {
        // Matching rows of the image window and of the template.
        __global const float * img_row = img + mad24(y + r, img_pitch, x + img_base);
        __global const float * tpl_row = tpl + mad24(r, tpl_pitch, tpl_base);
        for(int c = 0; c < tpl_cols; ++c)
        {
            acc = mad(img_row[c], tpl_row[c], acc);
        }
    }
    res[mad24(y, res_pitch, res_base + x)] = acc;
}
// Brute-force cross-correlation, four-channel uchar input.
// One work-item computes one result element; per-channel integer products are
// accumulated and the four channel sums are added for the final score.
// All *_step / *_offset arguments are divided by the element size before use.
__kernel
void matchTemplate_Naive_CCORR_C4_D0
(
    __global const uchar4 * img,
    __global const uchar4 * tpl,
    __global float * res,
    int img_rows,
    int img_cols,
    int tpl_rows,
    int tpl_cols,
    int res_rows,
    int res_cols,
    int img_offset,
    int tpl_offset,
    int res_offset,
    int img_step,
    int tpl_step,
    int res_step
)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    const int img_pitch = img_step / sizeof(*img);
    const int img_base = img_offset / sizeof(*img);
    const int tpl_pitch = tpl_step / sizeof(*tpl);
    const int tpl_base = tpl_offset / sizeof(*tpl);
    const int res_pitch = res_step / sizeof(*res);
    const int res_base = res_offset / sizeof(*res);

    if(x >= res_cols || y >= res_rows)
    {
        return;
    }

    int4 acc = (int4)(0, 0, 0, 0);
    for(int r = 0; r < tpl_rows; ++r)
    {
        // Matching rows of the image window and of the template.
        __global const uchar4 * img_row = img + mad24(y + r, img_pitch, x + img_base);
        __global const uchar4 * tpl_row = tpl + mad24(r, tpl_pitch, tpl_base);
        for(int c = 0; c < tpl_cols; ++c)
        {
            const int4 a = convert_int4(img_row[c]);
            const int4 b = convert_int4(tpl_row[c]);
            acc = mad24(a, b, acc);
        }
    }
    res[mad24(y, res_pitch, res_base + x)] = (float)(acc.x + acc.y + acc.z + acc.w);
}
// Brute-force cross-correlation, four-channel float input.
// One work-item computes one result element; per-channel products are
// accumulated with mad() and the four channel sums are added for the score.
// All *_step / *_offset arguments are divided by the element size before use.
__kernel
void matchTemplate_Naive_CCORR_C4_D5
(
    __global const float4 * img,
    __global const float4 * tpl,
    __global float * res,
    int img_rows,
    int img_cols,
    int tpl_rows,
    int tpl_cols,
    int res_rows,
    int res_cols,
    int img_offset,
    int tpl_offset,
    int res_offset,
    int img_step,
    int tpl_step,
    int res_step
)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    const int img_pitch = img_step / sizeof(*img);
    const int img_base = img_offset / sizeof(*img);
    const int tpl_pitch = tpl_step / sizeof(*tpl);
    const int tpl_base = tpl_offset / sizeof(*tpl);
    const int res_pitch = res_step / sizeof(*res);
    const int res_base = res_offset / sizeof(*res);

    if(x >= res_cols || y >= res_rows)
    {
        return;
    }

    float4 acc = (float4)(0, 0, 0, 0);
    for(int r = 0; r < tpl_rows; ++r)
    {
        // Matching rows of the image window and of the template.
        __global const float4 * img_row = img + mad24(y + r, img_pitch, x + img_base);
        __global const float4 * tpl_row = tpl + mad24(r, tpl_pitch, tpl_base);
        for(int c = 0; c < tpl_cols; ++c)
        {
            // Inputs are already float4, so no conversion is required.
            acc = mad(img_row[c], tpl_row[c], acc);
        }
    }
    res[mad24(y, res_pitch, res_base + x)] = acc.x + acc.y + acc.z + acc.w;
}
//////////////////////////////////////////////////
// CCOFF
// Subtracts the mean term from a cross-correlation map in place:
//   res -= window_sum * tpl_sum
// where window_sum is the sum over the tpl_cols x tpl_rows window taken from
// the integral image img_sums.
// The *_step / *_offset arguments are divided by the element size before use.
__kernel
void matchTemplate_Prepared_CCOFF_C1_D0
(
    __global float * res,
    int img_rows,
    int img_cols,
    int tpl_rows,
    int tpl_cols,
    int res_rows,
    int res_cols,
    int res_offset,
    int res_step,
    __global const uint * img_sums,
    int img_sums_offset,
    int img_sums_step,
    float tpl_sum
)
{
    // gidx / gidy and img_sums_* keep their names: SUMS_PTR reads them.
    int gidx = get_global_id(0);
    int gidy = get_global_id(1);

    img_sums_offset /= sizeof(*img_sums);
    img_sums_step /= sizeof(*img_sums);
    res_step /= sizeof(*res);
    res_offset /= sizeof(*res);

    if(gidx >= res_cols || gidy >= res_rows)
    {
        return;
    }

    // Four corners of the window in the integral image (unsigned arithmetic).
    const uint corner_br = img_sums[SUMS_PTR(tpl_cols, tpl_rows)];
    const uint corner_tr = img_sums[SUMS_PTR(tpl_cols, 0)];
    const uint corner_bl = img_sums[SUMS_PTR(0, tpl_rows)];
    const uint corner_tl = img_sums[SUMS_PTR(0, 0)];
    float window_sum = (float)((corner_br - corner_tr) - (corner_bl - corner_tl));

    const int out_idx = mad24(gidy, res_step, res_offset + gidx);
    res[out_idx] -= window_sum * tpl_sum;
}
// Four-channel variant of the mean-term subtraction. For every channel c:
//   res -= tpl_sum_c * window_sum_c
// where window_sum_c is the sum over the tpl_cols x tpl_rows window taken from
// that channel's integral image. All four integral images share one offset/step.
// The *_step / *_offset arguments are divided by the element size before use.
__kernel
void matchTemplate_Prepared_CCOFF_C4_D0
(
    __global float * res,
    int img_rows,
    int img_cols,
    int tpl_rows,
    int tpl_cols,
    int res_rows,
    int res_cols,
    int res_offset,
    int res_step,
    __global const uint * img_sums_c0,
    __global const uint * img_sums_c1,
    __global const uint * img_sums_c2,
    __global const uint * img_sums_c3,
    int img_sums_offset,
    int img_sums_step,
    float tpl_sum_c0,
    float tpl_sum_c1,
    float tpl_sum_c2,
    float tpl_sum_c3
)
{
    // gidx / gidy and img_sums_* keep their names: SUMS_PTR reads them.
    int gidx = get_global_id(0);
    int gidy = get_global_id(1);

    img_sums_offset /= sizeof(*img_sums_c0);
    img_sums_step /= sizeof(*img_sums_c0);
    res_step /= sizeof(*res);
    res_offset /= sizeof(*res);

    if(gidx >= res_cols || gidy >= res_rows)
    {
        return;
    }

    // Integral-image indices of the four window corners (shared by all channels).
    const int idx_br = SUMS_PTR(tpl_cols, tpl_rows);
    const int idx_tr = SUMS_PTR(tpl_cols, 0);
    const int idx_bl = SUMS_PTR(0, tpl_rows);
    const int idx_tl = SUMS_PTR(0, 0);

    const float window_c0 = (float)((img_sums_c0[idx_br] - img_sums_c0[idx_tr])
                                    - (img_sums_c0[idx_bl] - img_sums_c0[idx_tl]));
    const float window_c1 = (float)((img_sums_c1[idx_br] - img_sums_c1[idx_tr])
                                    - (img_sums_c1[idx_bl] - img_sums_c1[idx_tl]));
    const float window_c2 = (float)((img_sums_c2[idx_br] - img_sums_c2[idx_tr])
                                    - (img_sums_c2[idx_bl] - img_sums_c2[idx_tl]));
    const float window_c3 = (float)((img_sums_c3[idx_br] - img_sums_c3[idx_tr])
                                    - (img_sums_c3[idx_bl] - img_sums_c3[idx_tl]));

    const int out_idx = mad24(gidy, res_step, res_offset + gidx);
    float ccorr = res[out_idx];
    ccorr -= tpl_sum_c0 * window_c0;
    ccorr -= tpl_sum_c1 * window_c1;
    ccorr -= tpl_sum_c2 * window_c2;
    ccorr -= tpl_sum_c3 * window_c3;
    res[out_idx] = ccorr;
}
// Normalised correlation-coefficient step, applied in place:
//   res = normAcc(res - window_sum * tpl_sum,
//                 sqrt(tpl_sqsum * (window_sqsum - weight * window_sum^2)))
// window_sum and window_sqsum are the sums over the tpl_cols x tpl_rows window
// taken from the integral image (img_sums) and the squared-sum integral image
// (img_sqsums).
// The *_step / *_offset arguments are divided by the element size before use.
__kernel
void matchTemplate_Prepared_CCOFF_NORMED_C1_D0
(
    __global float * res,
    int img_rows,
    int img_cols,
    int tpl_rows,
    int tpl_cols,
    int res_rows,
    int res_cols,
    int res_offset,
    int res_step,
    float weight,
    __global const uint * img_sums,
    int img_sums_offset,
    int img_sums_step,
    __global const float * img_sqsums,
    int img_sqsums_offset,
    int img_sqsums_step,
    float tpl_sum,
    float tpl_sqsum
)
{
    // gidx / gidy and the img_*sums_* arguments keep their names: the
    // SUMS_PTR / SQSUMS_PTR macros read them.
    int gidx = get_global_id(0);
    int gidy = get_global_id(1);

    img_sqsums_step /= sizeof(*img_sqsums);
    img_sqsums_offset /= sizeof(*img_sqsums);
    img_sums_offset /= sizeof(*img_sums);
    img_sums_step /= sizeof(*img_sums);
    res_step /= sizeof(*res);
    res_offset /= sizeof(*res);

    if(gidx >= res_cols || gidy >= res_rows)
    {
        return;
    }

    // Window sum from the (unsigned) integral image.
    const uint sum_br = img_sums[SUMS_PTR(tpl_cols, tpl_rows)];
    const uint sum_tr = img_sums[SUMS_PTR(tpl_cols, 0)];
    const uint sum_bl = img_sums[SUMS_PTR(0, tpl_rows)];
    const uint sum_tl = img_sums[SUMS_PTR(0, 0)];
    float image_sum_ = (float)((sum_br - sum_tr) - (sum_bl - sum_tl));

    // Window sum of squares from the float integral image.
    const float sq_br = img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)];
    const float sq_tr = img_sqsums[SQSUMS_PTR(tpl_cols, 0)];
    const float sq_bl = img_sqsums[SQSUMS_PTR(0, tpl_rows)];
    const float sq_tl = img_sqsums[SQSUMS_PTR(0, 0)];
    float image_sqsum_ = (float)((sq_br - sq_tr) - (sq_bl - sq_tl));

    const int out_idx = mad24(gidy, res_step, res_offset + gidx);
    res[out_idx] = normAcc(res[out_idx] - image_sum_ * tpl_sum,
                           sqrt(tpl_sqsum * (image_sqsum_ - weight * image_sum_ * image_sum_)));
}
/*
 * matchTemplate_Prepared_CCOFF_NORMED_C4_D0
 *
 * Four-channel counterpart of the C1 kernel: rewrites every element of `res`
 * in place as normAcc(num, denum) with
 *     num   = res - sum over c of (image_sum_c * tpl_sum_c)
 *     denum = sqrt(tpl_sqsum * sum over c of
 *                  (image_sqsum_c - weight * image_sum_c^2))
 * where image_sum_c / image_sqsum_c are four-corner differences taken from the
 * per-channel img_sums_c* / img_sqsums_c* tables over a tpl_cols x tpl_rows
 * window.
 *
 * The *_offset and *_step arguments are divided by the element size before
 * use (presumably they arrive in bytes - confirm against the host code).
 * img_rows and img_cols are not referenced directly in this body.
 *
 * NOTE(review): SUMS_PTR, SQSUMS_PTR and normAcc are defined outside this
 * block. The macros presumably expand to expressions that mention gidx, gidy
 * and the img_*_offset / img_*_step names, so those identifiers are kept.
 */
__kernel
void matchTemplate_Prepared_CCOFF_NORMED_C4_D0
(
    __global float * res,
    int img_rows,
    int img_cols,
    int tpl_rows,
    int tpl_cols,
    int res_rows,
    int res_cols,
    int res_offset,
    int res_step,
    float weight,
    __global const uint * img_sums_c0,
    __global const uint * img_sums_c1,
    __global const uint * img_sums_c2,
    __global const uint * img_sums_c3,
    int img_sums_offset,
    int img_sums_step,
    __global const float * img_sqsums_c0,
    __global const float * img_sqsums_c1,
    __global const float * img_sqsums_c2,
    __global const float * img_sqsums_c3,
    int img_sqsums_offset,
    int img_sqsums_step,
    float tpl_sum_c0,
    float tpl_sum_c1,
    float tpl_sum_c2,
    float tpl_sum_c3,
    float tpl_sqsum
)
{
    int gidx = get_global_id(0);
    int gidy = get_global_id(1);

    // Convert every offset/step into units of its own element type.
    img_sqsums_step /= sizeof(*img_sqsums_c0);
    img_sqsums_offset /= sizeof(*img_sqsums_c0);
    img_sums_offset /= sizeof(*img_sums_c0);
    img_sums_step /= sizeof(*img_sums_c0);
    res_step /= sizeof(*res);
    res_offset /= sizeof(*res);

    int res_idx = mad24(gidy, res_step, res_offset + gidx);

    if(gidx < res_cols && gidy < res_rows)
    {
        // Per-channel window sums (four-corner differences).
        float image_sum_c0 = (float)(
            (img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)])
            - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)]));
        float image_sum_c1 = (float)(
            (img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)])
            - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)]));
        float image_sum_c2 = (float)(
            (img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)])
            - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)]));
        float image_sum_c3 = (float)(
            (img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)])
            - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)]));

        // Per-channel window sums of squares.
        float image_sqsum_c0 = (float)(
            (img_sqsums_c0[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(tpl_cols, 0)]) -
            (img_sqsums_c0[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(0, 0)]));
        float image_sqsum_c1 = (float)(
            (img_sqsums_c1[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(tpl_cols, 0)]) -
            (img_sqsums_c1[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(0, 0)]));
        float image_sqsum_c2 = (float)(
            (img_sqsums_c2[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(tpl_cols, 0)]) -
            (img_sqsums_c2[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(0, 0)]));
        float image_sqsum_c3 = (float)(
            (img_sqsums_c3[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(tpl_cols, 0)]) -
            (img_sqsums_c3[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(0, 0)]));

        float num = res[res_idx] -
                    image_sum_c0 * tpl_sum_c0 -
                    image_sum_c1 * tpl_sum_c1 -
                    image_sum_c2 * tpl_sum_c2 -
                    image_sum_c3 * tpl_sum_c3;

        // Every channel contributes (sqsum - weight * sum^2). The channel-3
        // term previously multiplied image_sum_c0 by image_sum_c3, mixing two
        // channels; it now squares image_sum_c3 like the other three terms.
        float denum = sqrt( tpl_sqsum * (
                                image_sqsum_c0 - weight * image_sum_c0 * image_sum_c0 +
                                image_sqsum_c1 - weight * image_sum_c1 * image_sum_c1 +
                                image_sqsum_c2 - weight * image_sum_c2 * image_sum_c2 +
                                image_sqsum_c3 - weight * image_sum_c3 * image_sum_c3)
                          );

        res[res_idx] = normAcc(num, denum);
    }
}
//////////////////////////////////////////////////////////////////////
// extractFirstChannel
/*
 * extractFirstChannel
 *
 * For every (column, row) inside a cols x rows region, stores the .x
 * component of the float4 source element into the float destination.
 *
 * img_offset / img_step are divided by sizeof(float4) and res_offset /
 * res_step by sizeof(float) before use (presumably they arrive in bytes -
 * confirm against the host code).
 */
__kernel
void extractFirstChannel
(
    const __global float4* img,
    __global float* res,
    int rows,
    int cols,
    int img_offset,
    int res_offset,
    int img_step,
    int res_step
)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Convert offsets and row strides to element units.
    img_offset /= sizeof(float4);
    img_step /= sizeof(float4);
    res_offset /= sizeof(float);
    res_step /= sizeof(float);

    // Work-items outside the region do nothing.
    if (x >= cols || y >= rows)
        return;

    const __global float4* src_base = img + img_offset;
    __global float* dst_base = res + res_offset;

    dst_base[x + y * res_step] = src_base[x + y * img_step].x;
}

@ -0,0 +1,486 @@
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Niko Li, newlife20080214@gmail.com
// Zero Lin, zero.lin@amd.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//
/*
__kernel void medianFilter_C1(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, int cols,
int rows, int srcStep, int dstStep, int m)
{
int dx = get_global_id(0)-(m>>1);
int dy = get_global_id(1)-(m>>1);
short histom[256];
for(int i=0;i<256;++i)
histom[i]=0;
for(int i=0;i<m;++i)
{
__global uchar * data = src + srcOffset + mul24(srcStep,clamp(dy + (i), 0, rows-1));
for(int j=dx;j<dx+m;++j)
{
histom[data[clamp(j, 0, cols-1)]]++;
}
}
int now=0;
int goal=(m*m+1)>>1;
int v;
for(int i=0;i<256;++i)
{
v=(now<goal?i:v);
now+=histom[i];
}
if(dy<rows && dx<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=v;
}
*/
// op(a, b): compare-exchange. Afterwards a holds min(a, b) and b holds
// max(a, b) (component-wise for vector types). It uses a scratch variable
// named `mid` that must be declared in the enclosing scope.
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}

/*
 * medianFilter3_C4_D0 - 3x3 median filter for uchar4 pixels; min/max work
 * component-wise, so each of the four components is filtered independently.
 *
 * src, dst              source / destination pixel buffers
 * srcOffset, dstOffset  index of the first pixel, in uchar4 elements
 * cols, rows            image size in pixels
 * srcStep, dstStep      row pitch, in uchar4 elements
 *
 * Source coordinates are clamped to the image, so edge pixels are replicated.
 *
 * NOTE(review): the flattening by 16 and the 18x18 tile (16 + 2 border
 * pixels) assume a 16x16 work-group - confirm against the host launch code.
 */
__kernel void medianFilter3_C4_D0(__global uchar4 * src, __global uchar4 * dst, int srcOffset, int dstOffset, int cols,
                                  int rows, int srcStep, int dstStep)
{
    // Local tile: the work-group's pixels plus a 1-pixel border on each side.
    __local uchar4 data[18][18];
    __global uchar4* source=src + srcOffset;

    // Image coordinates of the tile's top-left element
    // (first work-item of the group, minus the 1-pixel border).
    int dx = get_global_id(0) - get_local_id(0) -1;
    int dy = get_global_id(1) - get_local_id(1) -1;

    // Flattened local id, clamped to the last slot of the upper 9x18 half.
    // Each work-item loads one element of the upper half and the element
    // 9 rows below it; work-items past the clamp repeat the last load.
    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
    int dr=id/18;
    int dc=id%18;

    int r=clamp(dy+dr, 0, rows-1);
    int c=clamp(dx+dc, 0, cols-1);
    data[dr][dc] = source[r*srcStep + c];
    r=clamp(dy+dr+9, 0, rows-1);
    data[dr+9][dc] = source[r*srcStep + c];

    // The whole tile must be written before any work-item reads from it.
    barrier(CLK_LOCAL_MEM_FENCE);

    int x =get_local_id(0);
    int y =get_local_id(1);

    // 3x3 neighbourhood; (y+1, x+1) in the tile is this work-item's pixel.
    uchar4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
    uchar4 p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
    uchar4 p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
    uchar4 mid;

    // 19-exchange median-of-9 selection network; the result ends up in p4.
    op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
    op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
    op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
    op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
    op(p4, p2); op(p6, p4); op(p4, p2);

    if(get_global_id(1)<rows && get_global_id(0)<cols)
        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
}
// #undef takes only the macro name; the previous "(a,b)" suffix was invalid.
#undef op
// op(a, b): compare-exchange. Afterwards a holds min(a, b) and b holds
// max(a, b). It uses a scratch variable named `mid` that must be declared in
// the enclosing scope.
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}

/*
 * medianFilter3_C1_D0 - 3x3 median filter for single-channel uchar pixels.
 *
 * src, dst              source / destination pixel buffers
 * srcOffset, dstOffset  index of the first pixel, in uchar elements
 * cols, rows            image size in pixels
 * srcStep, dstStep      row pitch, in uchar elements
 *
 * Source coordinates are clamped to the image, so edge pixels are replicated.
 *
 * NOTE(review): the flattening by 16 and the 18x18 tile (16 + 2 border
 * pixels) assume a 16x16 work-group - confirm against the host launch code.
 */
__kernel void medianFilter3_C1_D0(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, int cols,
                                  int rows, int srcStep, int dstStep)
{
    // Local tile: the work-group's pixels plus a 1-pixel border on each side.
    __local uchar data[18][18];
    __global uchar* source=src + srcOffset;

    // Image coordinates of the tile's top-left element
    // (first work-item of the group, minus the 1-pixel border).
    int dx = get_global_id(0) - get_local_id(0) -1;
    int dy = get_global_id(1) - get_local_id(1) -1;

    // Flattened local id, clamped to the last slot of the upper 9x18 half.
    // Each work-item loads one element of the upper half and the element
    // 9 rows below it; work-items past the clamp repeat the last load.
    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
    int dr=id/18;
    int dc=id%18;

    int r=clamp(dy+dr, 0, rows-1);
    int c=clamp(dx+dc, 0, cols-1);
    data[dr][dc] = source[r*srcStep + c];
    r=clamp(dy+dr+9, 0, rows-1);
    data[dr+9][dc] = source[r*srcStep + c];

    // The whole tile must be written before any work-item reads from it.
    barrier(CLK_LOCAL_MEM_FENCE);

    int x =get_local_id(0);
    int y =get_local_id(1);

    // 3x3 neighbourhood; (y+1, x+1) in the tile is this work-item's pixel.
    uchar p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
    uchar p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
    uchar p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
    uchar mid;

    // 19-exchange median-of-9 selection network; the result ends up in p4.
    op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
    op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
    op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
    op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
    op(p4, p2); op(p6, p4); op(p4, p2);

    if(get_global_id(1)<rows && get_global_id(0)<cols)
        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
}
// #undef takes only the macro name; the previous "(a,b)" suffix was invalid.
#undef op
// op(a, b): compare-exchange. Afterwards a holds min(a, b) and b holds
// max(a, b). It uses a scratch variable named `mid` that must be declared in
// the enclosing scope.
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}

/*
 * medianFilter3_C1_D5 - 3x3 median filter for single-channel float pixels.
 *
 * src, dst              source / destination pixel buffers
 * srcOffset, dstOffset  index of the first pixel, in float elements
 * cols, rows            image size in pixels
 * srcStep, dstStep      row pitch, in float elements
 *
 * Source coordinates are clamped to the image, so edge pixels are replicated.
 *
 * NOTE(review): the flattening by 16 and the 18x18 tile (16 + 2 border
 * pixels) assume a 16x16 work-group - confirm against the host launch code.
 */
__kernel void medianFilter3_C1_D5(__global float * src, __global float * dst, int srcOffset, int dstOffset, int cols,
                                  int rows, int srcStep, int dstStep)
{
    // Local tile: the work-group's pixels plus a 1-pixel border on each side.
    __local float data[18][18];
    __global float* source=src + srcOffset;

    // Image coordinates of the tile's top-left element
    // (first work-item of the group, minus the 1-pixel border).
    int dx = get_global_id(0) - get_local_id(0) -1;
    int dy = get_global_id(1) - get_local_id(1) -1;

    // Flattened local id, clamped to the last slot of the upper 9x18 half.
    // Each work-item loads one element of the upper half and the element
    // 9 rows below it; work-items past the clamp repeat the last load.
    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
    int dr=id/18;
    int dc=id%18;

    int r=clamp(dy+dr, 0, rows-1);
    int c=clamp(dx+dc, 0, cols-1);
    data[dr][dc] = source[r*srcStep + c];
    r=clamp(dy+dr+9, 0, rows-1);
    data[dr+9][dc] = source[r*srcStep + c];

    // The whole tile must be written before any work-item reads from it.
    barrier(CLK_LOCAL_MEM_FENCE);

    int x =get_local_id(0);
    int y =get_local_id(1);

    // 3x3 neighbourhood; (y+1, x+1) in the tile is this work-item's pixel.
    float p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
    float p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
    float p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
    float mid;

    // 19-exchange median-of-9 selection network; the result ends up in p4.
    op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
    op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
    op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
    op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
    op(p4, p2); op(p6, p4); op(p4, p2);

    if(get_global_id(1)<rows && get_global_id(0)<cols)
        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
}
// #undef takes only the macro name; the previous "(a,b)" suffix was invalid.
#undef op
// op(a, b): compare-exchange. Afterwards a holds min(a, b) and b holds
// max(a, b) (component-wise for vector types). It uses a scratch variable
// named `mid` that must be declared in the enclosing scope.
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}

/*
 * medianFilter3_C4_D5 - 3x3 median filter for float4 pixels; min/max work
 * component-wise, so each of the four components is filtered independently.
 *
 * src, dst              source / destination pixel buffers
 * srcOffset, dstOffset  index of the first pixel, in float4 elements
 * cols, rows            image size in pixels
 * srcStep, dstStep      row pitch, in float4 elements
 *
 * Source coordinates are clamped to the image, so edge pixels are replicated.
 *
 * NOTE(review): the flattening by 16 and the 18x18 tile (16 + 2 border
 * pixels) assume a 16x16 work-group - confirm against the host launch code.
 */
__kernel void medianFilter3_C4_D5(__global float4 * src, __global float4 * dst, int srcOffset, int dstOffset, int cols,
                                  int rows, int srcStep, int dstStep)
{
    // Local tile: the work-group's pixels plus a 1-pixel border on each side.
    __local float4 data[18][18];
    __global float4* source=src + srcOffset;

    // Image coordinates of the tile's top-left element
    // (first work-item of the group, minus the 1-pixel border).
    int dx = get_global_id(0) - get_local_id(0) -1;
    int dy = get_global_id(1) - get_local_id(1) -1;

    // Flattened local id, clamped to the last slot of the upper 9x18 half.
    // Each work-item loads one element of the upper half and the element
    // 9 rows below it; work-items past the clamp repeat the last load.
    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
    int dr=id/18;
    int dc=id%18;

    int r=clamp(dy+dr, 0, rows-1);
    int c=clamp(dx+dc, 0, cols-1);
    data[dr][dc] = source[r*srcStep + c];
    r=clamp(dy+dr+9, 0, rows-1);
    data[dr+9][dc] = source[r*srcStep + c];

    // The whole tile must be written before any work-item reads from it.
    barrier(CLK_LOCAL_MEM_FENCE);

    int x =get_local_id(0);
    int y =get_local_id(1);

    // 3x3 neighbourhood; (y+1, x+1) in the tile is this work-item's pixel.
    float4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
    float4 p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
    float4 p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
    float4 mid;

    // 19-exchange median-of-9 selection network; the result ends up in p4.
    op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
    op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
    op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
    op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
    op(p4, p2); op(p6, p4); op(p4, p2);

    if(get_global_id(1)<rows && get_global_id(0)<cols)
        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
}
// #undef takes only the macro name; the previous "(a,b)" suffix was invalid.
#undef op
// op(a, b): compare-exchange. Afterwards a holds min(a, b) and b holds
// max(a, b) (component-wise for vector types). It uses a scratch variable
// named `mid` that must be declared in the enclosing scope.
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}

/*
 * medianFilter5_C4_D0 - 5x5 median filter for uchar4 pixels; min/max work
 * component-wise, so each of the four components is filtered independently.
 *
 * src, dst              source / destination pixel buffers
 * srcOffset, dstOffset  index of the first pixel, in uchar4 elements
 * cols, rows            image size in pixels
 * srcStep, dstStep      row pitch, in uchar4 elements
 *
 * Source coordinates are clamped to the image, so edge pixels are replicated.
 *
 * NOTE(review): the flattening by 16 and the 20x20 tile (16 + 4 border
 * pixels) assume a 16x16 work-group - confirm against the host launch code.
 */
__kernel void medianFilter5_C4_D0(__global uchar4 * src, __global uchar4 * dst, int srcOffset, int dstOffset, int cols,
                                  int rows, int srcStep, int dstStep)
{
    // Local tile: the work-group's pixels plus a 2-pixel border on each side.
    __local uchar4 data[20][20];
    __global uchar4* source=src + srcOffset;

    // Image coordinates of the tile's top-left element
    // (first work-item of the group, minus the 2-pixel border).
    int dx = get_global_id(0) - get_local_id(0) -2;
    int dy = get_global_id(1) - get_local_id(1) -2;

    // Flattened local id, clamped to the last slot of the upper 10x20 half.
    // Each work-item loads one element of the upper half and the element
    // 10 rows below it; work-items past the clamp repeat the last load.
    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
    int dr=id/20;
    int dc=id%20;

    int r=clamp(dy+dr, 0, rows-1);
    int c=clamp(dx+dc, 0, cols-1);
    data[dr][dc] = source[r*srcStep + c];
    r=clamp(dy+dr+10, 0, rows-1);
    data[dr+10][dc] = source[r*srcStep + c];

    // The whole tile must be written before any work-item reads from it.
    barrier(CLK_LOCAL_MEM_FENCE);

    int x =get_local_id(0);
    int y =get_local_id(1);

    // 5x5 neighbourhood; (y+2, x+2) in the tile is this work-item's pixel.
    uchar4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
    uchar4 p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
    uchar4 p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
    uchar4 p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
    uchar4 p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
    uchar4 mid;

    // Fixed compare-exchange network over the 25 samples; the value written
    // out is taken from p12 (intended to be the median - the sequence is kept
    // exactly as it was, do not reorder).
    op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
    op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
    op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
    op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
    op(p8, p9); op(p7, p10); op(p7, p8); op(p9, p10); op(p0, p6);
    op(p4, p10); op(p4, p6); op(p2, p8); op(p2, p4); op(p6, p8);
    op(p1, p7); op(p5, p11); op(p5, p7); op(p3, p9); op(p3, p5);
    op(p7, p9); op(p1, p2); op(p3, p4); op(p5, p6); op(p7, p8);
    op(p9, p10); op(p13, p14); op(p12, p13); op(p13, p14); op(p16, p17);
    op(p15, p16); op(p16, p17); op(p12, p15); op(p14, p17); op(p14, p15);
    op(p13, p16); op(p13, p14); op(p15, p16); op(p19, p20); op(p18, p19);
    op(p19, p20); op(p21, p22); op(p23, p24); op(p21, p23); op(p22, p24);
    op(p22, p23); op(p18, p21); op(p20, p23); op(p20, p21); op(p19, p22);
    op(p22, p24); op(p19, p20); op(p21, p22); op(p23, p24); op(p12, p18);
    op(p16, p22); op(p16, p18); op(p14, p20); op(p20, p24); op(p14, p16);
    op(p18, p20); op(p22, p24); op(p13, p19); op(p17, p23); op(p17, p19);
    op(p15, p21); op(p15, p17); op(p19, p21); op(p13, p14); op(p15, p16);
    op(p17, p18); op(p19, p20); op(p21, p22); op(p23, p24); op(p0, p12);
    op(p8, p20); op(p8, p12); op(p4, p16); op(p16, p24); op(p12, p16);
    op(p2, p14); op(p10, p22); op(p10, p14); op(p6, p18); op(p6, p10);
    op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
    op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
    op(p7, p11); op(p11, p13); op(p11, p12);

    if(get_global_id(1)<rows && get_global_id(0)<cols)
        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
}
// #undef takes only the macro name; the previous "(a,b)" suffix was invalid.
#undef op
// op(a, b): compare-exchange. Afterwards a holds min(a, b) and b holds
// max(a, b). It uses a scratch variable named `mid` that must be declared in
// the enclosing scope.
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}

/*
 * medianFilter5_C1_D0 - 5x5 median filter for single-channel uchar pixels.
 *
 * src, dst              source / destination pixel buffers
 * srcOffset, dstOffset  index of the first pixel, in uchar elements
 * cols, rows            image size in pixels
 * srcStep, dstStep      row pitch, in uchar elements
 *
 * Source coordinates are clamped to the image, so edge pixels are replicated.
 *
 * NOTE(review): the flattening by 16 and the 20x20 tile (16 + 4 border
 * pixels) assume a 16x16 work-group - confirm against the host launch code.
 */
__kernel void medianFilter5_C1_D0(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, int cols,
                                  int rows, int srcStep, int dstStep)
{
    // Local tile: the work-group's pixels plus a 2-pixel border on each side.
    __local uchar data[20][20];
    __global uchar* source=src + srcOffset;

    // Image coordinates of the tile's top-left element
    // (first work-item of the group, minus the 2-pixel border).
    int dx = get_global_id(0) - get_local_id(0) -2;
    int dy = get_global_id(1) - get_local_id(1) -2;

    // Flattened local id, clamped to the last slot of the upper 10x20 half.
    // Each work-item loads one element of the upper half and the element
    // 10 rows below it; work-items past the clamp repeat the last load.
    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
    int dr=id/20;
    int dc=id%20;

    int r=clamp(dy+dr, 0, rows-1);
    int c=clamp(dx+dc, 0, cols-1);
    data[dr][dc] = source[r*srcStep + c];
    r=clamp(dy+dr+10, 0, rows-1);
    data[dr+10][dc] = source[r*srcStep + c];

    // The whole tile must be written before any work-item reads from it.
    barrier(CLK_LOCAL_MEM_FENCE);

    int x =get_local_id(0);
    int y =get_local_id(1);

    // 5x5 neighbourhood; (y+2, x+2) in the tile is this work-item's pixel.
    uchar p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
    uchar p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
    uchar p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
    uchar p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
    uchar p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
    uchar mid;

    // Fixed compare-exchange network over the 25 samples; the value written
    // out is taken from p12 (intended to be the median - the sequence is kept
    // exactly as it was, do not reorder).
    op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
    op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
    op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
    op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
    op(p8, p9); op(p7, p10); op(p7, p8); op(p9, p10); op(p0, p6);
    op(p4, p10); op(p4, p6); op(p2, p8); op(p2, p4); op(p6, p8);
    op(p1, p7); op(p5, p11); op(p5, p7); op(p3, p9); op(p3, p5);
    op(p7, p9); op(p1, p2); op(p3, p4); op(p5, p6); op(p7, p8);
    op(p9, p10); op(p13, p14); op(p12, p13); op(p13, p14); op(p16, p17);
    op(p15, p16); op(p16, p17); op(p12, p15); op(p14, p17); op(p14, p15);
    op(p13, p16); op(p13, p14); op(p15, p16); op(p19, p20); op(p18, p19);
    op(p19, p20); op(p21, p22); op(p23, p24); op(p21, p23); op(p22, p24);
    op(p22, p23); op(p18, p21); op(p20, p23); op(p20, p21); op(p19, p22);
    op(p22, p24); op(p19, p20); op(p21, p22); op(p23, p24); op(p12, p18);
    op(p16, p22); op(p16, p18); op(p14, p20); op(p20, p24); op(p14, p16);
    op(p18, p20); op(p22, p24); op(p13, p19); op(p17, p23); op(p17, p19);
    op(p15, p21); op(p15, p17); op(p19, p21); op(p13, p14); op(p15, p16);
    op(p17, p18); op(p19, p20); op(p21, p22); op(p23, p24); op(p0, p12);
    op(p8, p20); op(p8, p12); op(p4, p16); op(p16, p24); op(p12, p16);
    op(p2, p14); op(p10, p22); op(p10, p14); op(p6, p18); op(p6, p10);
    op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
    op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
    op(p7, p11); op(p11, p13); op(p11, p12);

    if(get_global_id(1)<rows && get_global_id(0)<cols)
        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
}
// #undef takes only the macro name; the previous "(a,b)" suffix was invalid.
#undef op
// op(a, b): compare-exchange. Afterwards a holds min(a, b) and b holds
// max(a, b) (component-wise for vector types). It uses a scratch variable
// named `mid` that must be declared in the enclosing scope.
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}

/*
 * medianFilter5_C4_D5 - 5x5 median filter for float4 pixels; min/max work
 * component-wise, so each of the four components is filtered independently.
 *
 * src, dst              source / destination pixel buffers
 * srcOffset, dstOffset  index of the first pixel, in float4 elements
 * cols, rows            image size in pixels
 * srcStep, dstStep      row pitch, in float4 elements
 *
 * Source coordinates are clamped to the image, so edge pixels are replicated.
 *
 * NOTE(review): the flattening by 16 and the 20x20 tile (16 + 4 border
 * pixels) assume a 16x16 work-group - confirm against the host launch code.
 */
__kernel void medianFilter5_C4_D5(__global float4 * src, __global float4 * dst, int srcOffset, int dstOffset, int cols,
                                  int rows, int srcStep, int dstStep)
{
    // Local tile: the work-group's pixels plus a 2-pixel border on each side.
    __local float4 data[20][20];
    __global float4* source=src + srcOffset;

    // Image coordinates of the tile's top-left element
    // (first work-item of the group, minus the 2-pixel border).
    int dx = get_global_id(0) - get_local_id(0) -2;
    int dy = get_global_id(1) - get_local_id(1) -2;

    // Flattened local id, clamped to the last slot of the upper 10x20 half.
    // Each work-item loads one element of the upper half and the element
    // 10 rows below it; work-items past the clamp repeat the last load.
    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
    int dr=id/20;
    int dc=id%20;

    int r=clamp(dy+dr, 0, rows-1);
    int c=clamp(dx+dc, 0, cols-1);
    data[dr][dc] = source[r*srcStep + c];
    r=clamp(dy+dr+10, 0, rows-1);
    data[dr+10][dc] = source[r*srcStep + c];

    // The whole tile must be written before any work-item reads from it.
    barrier(CLK_LOCAL_MEM_FENCE);

    int x =get_local_id(0);
    int y =get_local_id(1);

    // 5x5 neighbourhood; (y+2, x+2) in the tile is this work-item's pixel.
    float4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
    float4 p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
    float4 p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
    float4 p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
    float4 p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
    float4 mid;

    // Fixed compare-exchange network over the 25 samples; the value written
    // out is taken from p12 (intended to be the median - the sequence is kept
    // exactly as it was, do not reorder).
    op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
    op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
    op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
    op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
    op(p8, p9); op(p7, p10); op(p7, p8); op(p9, p10); op(p0, p6);
    op(p4, p10); op(p4, p6); op(p2, p8); op(p2, p4); op(p6, p8);
    op(p1, p7); op(p5, p11); op(p5, p7); op(p3, p9); op(p3, p5);
    op(p7, p9); op(p1, p2); op(p3, p4); op(p5, p6); op(p7, p8);
    op(p9, p10); op(p13, p14); op(p12, p13); op(p13, p14); op(p16, p17);
    op(p15, p16); op(p16, p17); op(p12, p15); op(p14, p17); op(p14, p15);
    op(p13, p16); op(p13, p14); op(p15, p16); op(p19, p20); op(p18, p19);
    op(p19, p20); op(p21, p22); op(p23, p24); op(p21, p23); op(p22, p24);
    op(p22, p23); op(p18, p21); op(p20, p23); op(p20, p21); op(p19, p22);
    op(p22, p24); op(p19, p20); op(p21, p22); op(p23, p24); op(p12, p18);
    op(p16, p22); op(p16, p18); op(p14, p20); op(p20, p24); op(p14, p16);
    op(p18, p20); op(p22, p24); op(p13, p19); op(p17, p23); op(p17, p19);
    op(p15, p21); op(p15, p17); op(p19, p21); op(p13, p14); op(p15, p16);
    op(p17, p18); op(p19, p20); op(p21, p22); op(p23, p24); op(p0, p12);
    op(p8, p20); op(p8, p12); op(p4, p16); op(p16, p24); op(p12, p16);
    op(p2, p14); op(p10, p22); op(p10, p14); op(p6, p18); op(p6, p10);
    op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
    op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
    op(p7, p11); op(p11, p13); op(p11, p12);

    if(get_global_id(1)<rows && get_global_id(0)<cols)
        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
}
// #undef takes only the macro name; the previous "(a,b)" suffix was invalid.
#undef op
// op(a, b): compare-exchange. Afterwards a holds min(a, b) and b holds
// max(a, b). It uses a scratch variable named `mid` that must be declared in
// the enclosing scope.
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}

/*
 * medianFilter5_C1_D5 - 5x5 median filter for single-channel float pixels.
 *
 * src, dst              source / destination pixel buffers
 * srcOffset, dstOffset  index of the first pixel, in float elements
 * cols, rows            image size in pixels
 * srcStep, dstStep      row pitch, in float elements
 *
 * Source coordinates are clamped to the image, so edge pixels are replicated.
 *
 * NOTE(review): the flattening by 16 and the 20x20 tile (16 + 4 border
 * pixels) assume a 16x16 work-group - confirm against the host launch code.
 */
__kernel void medianFilter5_C1_D5(__global float * src, __global float * dst, int srcOffset, int dstOffset, int cols,
                                  int rows, int srcStep, int dstStep)
{
    // Local tile: the work-group's pixels plus a 2-pixel border on each side.
    __local float data[20][20];
    __global float* source=src + srcOffset;

    // Image coordinates of the tile's top-left element
    // (first work-item of the group, minus the 2-pixel border).
    int dx = get_global_id(0) - get_local_id(0) -2;
    int dy = get_global_id(1) - get_local_id(1) -2;

    // Flattened local id, clamped to the last slot of the upper 10x20 half.
    // Each work-item loads one element of the upper half and the element
    // 10 rows below it; work-items past the clamp repeat the last load.
    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
    int dr=id/20;
    int dc=id%20;

    int r=clamp(dy+dr, 0, rows-1);
    int c=clamp(dx+dc, 0, cols-1);
    data[dr][dc] = source[r*srcStep + c];
    r=clamp(dy+dr+10, 0, rows-1);
    data[dr+10][dc] = source[r*srcStep + c];

    // The whole tile must be written before any work-item reads from it.
    barrier(CLK_LOCAL_MEM_FENCE);

    int x =get_local_id(0);
    int y =get_local_id(1);

    // 5x5 neighbourhood; (y+2, x+2) in the tile is this work-item's pixel.
    float p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
    float p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
    float p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
    float p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
    float p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
    float mid;

    // Fixed compare-exchange network over the 25 samples; the value written
    // out is taken from p12 (intended to be the median - the sequence is kept
    // exactly as it was, do not reorder).
    op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
    op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
    op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
    op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
    op(p8, p9); op(p7, p10); op(p7, p8); op(p9, p10); op(p0, p6);
    op(p4, p10); op(p4, p6); op(p2, p8); op(p2, p4); op(p6, p8);
    op(p1, p7); op(p5, p11); op(p5, p7); op(p3, p9); op(p3, p5);
    op(p7, p9); op(p1, p2); op(p3, p4); op(p5, p6); op(p7, p8);
    op(p9, p10); op(p13, p14); op(p12, p13); op(p13, p14); op(p16, p17);
    op(p15, p16); op(p16, p17); op(p12, p15); op(p14, p17); op(p14, p15);
    op(p13, p16); op(p13, p14); op(p15, p16); op(p19, p20); op(p18, p19);
    op(p19, p20); op(p21, p22); op(p23, p24); op(p21, p23); op(p22, p24);
    op(p22, p23); op(p18, p21); op(p20, p23); op(p20, p21); op(p19, p22);
    op(p22, p24); op(p19, p20); op(p21, p22); op(p23, p24); op(p12, p18);
    op(p16, p22); op(p16, p18); op(p14, p20); op(p20, p24); op(p14, p16);
    op(p18, p20); op(p22, p24); op(p13, p19); op(p17, p23); op(p17, p19);
    op(p15, p21); op(p15, p17); op(p19, p21); op(p13, p14); op(p15, p16);
    op(p17, p18); op(p19, p20); op(p21, p22); op(p23, p24); op(p0, p12);
    op(p8, p20); op(p8, p12); op(p4, p16); op(p16, p24); op(p12, p16);
    op(p2, p14); op(p10, p22); op(p10, p14); op(p6, p18); op(p6, p10);
    op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
    op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
    op(p7, p11); op(p11, p13); op(p11, p12);

    if(get_global_id(1)<rows && get_global_id(0)<cols)
        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
}
// #undef takes only the macro name; the previous "(a,b)" suffix was invalid.
#undef op

@ -0,0 +1,207 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Shengen Yan,yanshengen@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
// Enable double precision only through an extension the compiler advertises
// (Khronos variant first, then the AMD variant).
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////Macro for border type////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////
// Each border mode defines four index-remapping macros:
//   ADDR_L(i, l_edge, r_edge) - remaps i when it lies left of l_edge
//   ADDR_R(i, r_edge, addr)   - remaps i when it is >= r_edge, otherwise yields addr
//   ADDR_H(i, t_edge, b_edge) - remaps i when it lies above t_edge
//   ADDR_B(i, b_edge, addr)   - remaps i when it is >= b_edge, otherwise yields addr
// The kernel below applies them in pairs (H then B, L then R), feeding the
// first result in as `addr` of the second.
#ifdef BORDER_REPLICATE
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#endif
#ifdef BORDER_REFLECT
//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
#endif
#ifdef BORDER_REFLECT101
//BORDER_REFLECT101: gfedcb|abcdefgh|gfedcba
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
#endif
#ifdef BORDER_WRAP
//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
#endif
// Width of the __local scratch rows used by the kernel below
// (presumably also the work-group size along x - confirm against the host).
#define THREADS 256
// Yields elem1 when l_edge <= i < r_edge, otherwise elem2. The expansion is
// wrapped in parentheses so it stays a single operand inside larger
// expressions.
#define ELEM(i, l_edge, r_edge, elem1, elem2) ((i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2))
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////calcMinEigenVal///////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////
/*
 * calcMinEigenVal
 *
 * For every output pixel, accumulates Dx*Dx, Dx*Dy and Dy*Dy over a
 * ksX x ksY window and writes
 *     (a + c) - sqrt((a - c)^2 + b^2)
 * with a = 0.5 * sum(Dx*Dx), b = sum(Dx*Dy), c = 0.5 * sum(Dy*Dy),
 * i.e. the smaller eigenvalue of [[2a, b], [b, 2c]].
 *
 * Each work-item handles one column and two vertically adjacent output rows:
 * it first sums its column vertically, publishes the six partial sums in
 * local memory, and then sums horizontally across neighbouring columns.
 *
 * Dx, Dy          derivative images (float)
 * dst             output image (float)
 * *_offset        offset of the region of interest; split into a row
 *                 (offset / step) and a column ((offset % step) >> 2)
 * *_whole_rows/cols  extents used for the border tests on Dx / Dy
 * *_step          row pitch; (step >> 2) is used as the pitch in floats
 * dst_rows/cols   size of the output region
 * k               not used by this kernel
 *
 * ksX, ksY (window size) and anX, anY (anchor) are not defined in this
 * block; presumably they come from the program build options - confirm.
 * Exactly one BORDER_* mode is expected to be defined as well.
 */
__kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy, __global float *dst,
                              int dx_offset, int dx_whole_rows, int dx_whole_cols, int dx_step,
                              int dy_offset, int dy_whole_rows, int dy_whole_cols, int dy_step,
                              int dst_offset, int dst_rows, int dst_cols, int dst_step,
                              float k)
{
    int col = get_local_id(0);
    const int gX = get_group_id(0);
    const int gY = get_group_id(1);
    const int gly = get_global_id(1);

    // Split each offset into a column (in floats) and a row.
    int dx_x_off = (dx_offset % dx_step) >> 2;
    int dx_y_off = dx_offset / dx_step;
    int dy_x_off = (dy_offset % dy_step) >> 2;
    int dy_y_off = dy_offset / dy_step;
    int dst_x_off = (dst_offset % dst_step) >> 2;
    int dst_y_off = dst_offset / dst_step;

    // A group produces THREADS-ksX+1 output columns and two output rows.
    int dx_startX = gX * (THREADS-ksX+1) - anX + dx_x_off;
    int dx_startY = (gY << 1) - anY + dx_y_off;
    int dy_startX = gX * (THREADS-ksX+1) - anX + dy_x_off;
    int dy_startY = (gY << 1) - anY + dy_y_off;
    int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
    int dst_startY = (gY << 1) + dst_y_off;

    // ksY+1 rows are needed because two output rows share ksY-1 input rows.
    float dx_data[ksY+1],dy_data[ksY+1],data[3][ksY+1];
    __local float temp[6][THREADS];

#ifdef BORDER_CONSTANT
    for(int i=0; i < ksY+1; i++)
    {
        // Read global memory only for in-range coordinates; out-of-range
        // samples are zero. (The load used to happen before the range test,
        // which could touch memory outside the buffers.)
        bool dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows;
        dx_data[i] = dx_con ? Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)] : 0.0f;

        bool dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows;
        dy_data[i] = dy_con ? Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)] : 0.0f;

        data[0][i] = dx_data[i] * dx_data[i];
        data[1][i] = dx_data[i] * dy_data[i];
        data[2][i] = dy_data[i] * dy_data[i];
    }
#else
    int clamped_col = min(dst_cols, col);
    for(int i=0; i < ksY+1; i++)
    {
        // Remap row/column through the active border mode before loading.
        int dx_selected_row;
        int dx_selected_col;
        dx_selected_row = ADDR_H(dx_startY+i, 0, dx_whole_rows);
        dx_selected_row = ADDR_B(dx_startY+i, dx_whole_rows, dx_selected_row);
        dx_selected_col = ADDR_L(dx_startX+clamped_col, 0, dx_whole_cols);
        dx_selected_col = ADDR_R(dx_startX+clamped_col, dx_whole_cols, dx_selected_col);
        dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col];

        int dy_selected_row;
        int dy_selected_col;
        dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows);
        dy_selected_row = ADDR_B(dy_startY+i, dy_whole_rows, dy_selected_row);
        dy_selected_col = ADDR_L(dy_startX+clamped_col, 0, dy_whole_cols);
        dy_selected_col = ADDR_R(dy_startX+clamped_col, dy_whole_cols, dy_selected_col);
        dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col];

        data[0][i] = dx_data[i] * dx_data[i];
        data[1][i] = dx_data[i] * dy_data[i];
        data[2][i] = dy_data[i] * dy_data[i];
    }
#endif

    // Vertical sums over the rows shared by both output rows (1 .. ksY-1).
    float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f;
    for(int i=1; i < ksY; i++)
    {
        sum0 += (data[0][i]);
        sum1 += (data[1][i]);
        sum2 += (data[2][i]);
    }

    // Add row 0 for the first output row and row ksY for the second one.
    // Even temp rows belong to the first output row, odd ones to the second.
    float sum01,sum02,sum11,sum12,sum21,sum22;
    sum01 = sum0 + (data[0][0]);
    sum02 = sum0 + (data[0][ksY]);
    temp[0][col] = sum01;
    temp[1][col] = sum02;
    sum11 = sum1 + (data[1][0]);
    sum12 = sum1 + (data[1][ksY]);
    temp[2][col] = sum11;
    temp[3][col] = sum12;
    sum21 = sum2 + (data[2][0]);
    sum22 = sum2 + (data[2][ksY]);
    temp[4][col] = sum21;
    temp[5][col] = sum22;

    // All column sums must be published before the horizontal pass.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Only the first THREADS-(ksX-1) columns have a full window to their right.
    if(col < (THREADS-(ksX-1)))
    {
        col += anX;
        int posX = dst_startX - dst_x_off + col - anX;
        int posY = (gly << 1);
        // An even ksX makes the window one column shorter on the right.
        int till = (ksX + 1)%2;
        float tmp_sum[6]={ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };

        // Horizontal sums. The index is named s so that it no longer hides
        // the kernel parameter k.
        for(int s=0; s<6; s++)
            for(int i=-anX; i<=anX - till; i++)
            {
                tmp_sum[s] += temp[s][col+i];
            }

        if(posX < dst_cols && (posY) < dst_rows)
        {
            float a = tmp_sum[0] * 0.5f;
            float b = tmp_sum[2];
            float c = tmp_sum[4] * 0.5f;
            dst[(dst_startY+0) * (dst_step>>2)+ dst_startX + col - anX] = (float)((a+c) - sqrt((a-c)*(a-c) + b*b));
        }
        if(posX < dst_cols && (posY + 1) < dst_rows)
        {
            float a = tmp_sum[1] * 0.5f;
            float b = tmp_sum[3];
            float c = tmp_sum[5] * 0.5f;
            dst[(dst_startY+1) * (dst_step>>2)+ dst_startX + col - anX] = (float)((a+c) - sqrt((a-c)*(a-c) + b*b));
        }
    }
}

@ -0,0 +1,980 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Sen Liu, swjtuls1987@126.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
// Type configuration shared by every kernel in this file.
//   T  - element type of the contour-moment output (dst_a in icvContourMoments)
//   F  - element type of the per-tile moment buffers (dst_m, sum)
//   F4 - four-wide vector of F, with convert_F4 as the matching conversion
// When the program is built with DOUBLE_SUPPORT defined, all of them are
// double precision; otherwise F/F4 are single precision and T is a 64-bit integer.
#if defined (DOUBLE_SUPPORT)
// Enable whichever fp64 extension the compiler advertises.
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
typedef double T;
typedef double F;
typedef double4 F4;
#define convert_F4 convert_double4
#else
typedef float F;
typedef float4 F4;
typedef long T;
#define convert_F4 convert_float4
#endif
// Index of each moment m_pq (DST_ROW_pq) in the destination buffers.
// The kernels multiply this index by a row stride to locate the slot that
// holds the corresponding moment; the order is m00, m10, m01, m20, m11, m02,
// m30, m21, m12, m03.
#define DST_ROW_00 0
#define DST_ROW_10 1
#define DST_ROW_01 2
#define DST_ROW_20 3
#define DST_ROW_11 4
#define DST_ROW_02 5
#define DST_ROW_30 6
#define DST_ROW_21 7
#define DST_ROW_12 8
#define DST_ROW_03 9
// Computes, for one contour edge per work-item, the ten terms that edge
// contributes to the contour moments.
//
//   contour_total      - number of contour vertices (and of edges)
//   reader_oclmat_data - vertices as interleaved (x, y) floats
//   dst_a              - output; term for moment DST_ROW_pq of edge idx is
//                        written to dst_a[DST_ROW_pq * (dst_step / sizeof(T)) + idx]
//   dst_step           - byte stride between consecutive moment rows of dst_a
//
// Work-items whose global id is outside [0, contour_total) do nothing.
__kernel void icvContourMoments(int contour_total,
                                __global float* reader_oclmat_data,
                                __global T* dst_a,
                                int dst_step)
{
    const int idx = get_global_id(0);
    if (idx < 0 || idx >= contour_total)
        return;

    // The edge runs from vertex idx to the following vertex; the last vertex
    // pairs with vertex 0.
    const int next = (idx == contour_total - 1) ? 0 : idx + 1;

    // Start point (px, py) and end point (qx, qy) of the edge.
    const T px = (T)reader_oclmat_data[idx * 2];
    const T py = (T)reader_oclmat_data[idx * 2 + 1];
    const T qx = (T)reader_oclmat_data[next * 2];
    const T qy = (T)reader_oclmat_data[next * 2 + 1];

    const T px2 = px * px;
    const T py2 = py * py;
    const T qx2 = qx * qx;
    const T qy2 = qy * qy;

    const T cross = px * qy - qx * py;   // common factor of every term
    const T sx = px + qx;
    const T sy = py + qy;

    // Stride between moment rows, in elements of T.
    const int row_stride = dst_step / sizeof(T);
    __global T* out = dst_a + idx;

    out[DST_ROW_00 * row_stride] = cross;
    out[DST_ROW_10 * row_stride] = cross * sx;
    out[DST_ROW_01 * row_stride] = cross * sy;
    out[DST_ROW_20 * row_stride] = cross * (px * sx + qx2);
    out[DST_ROW_11 * row_stride] = cross * (px * (sy + py) + qx * (sy + qy));
    out[DST_ROW_02 * row_stride] = cross * (py * sy + qy2);
    out[DST_ROW_30 * row_stride] = cross * sx * (px2 + qx2);
    out[DST_ROW_03 * row_stride] = cross * sy * (py2 + qy2);
    out[DST_ROW_21 * row_stride] =
        cross * (px2 * (3 * py + qy) + 2 * qx * px * sy +
                 qx2 * (py + 3 * qy));
    out[DST_ROW_12 * row_stride] =
        cross * (py2 * (3 * px + qx) + 2 * qy * py * sx +
                 qy2 * (px + 3 * qx));
}
// Adds up the per-tile moments stored in dst_m and writes the ten totals to
// sum[0..9].
//
// The number of tiles is derived from the image and tile sizes; up to 128
// per-tile values per moment are staged in local memory and folded with a
// 128 -> 1 tree reduction indexed by get_global_id(0).
//
//   src_rows, src_cols      - image size
//   tile_height, tile_width - divisors used for the base tile counts
//   TILE_SIZE               - tile edge used to detect a partial last tile
//   sum                     - output, ten values
//   dst_m                   - per-tile moments
//   dst_step                - byte stride of dst_m
__kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_width, int TILE_SIZE,
                      __global F* sum, __global F* dst_m, int dst_step)
{
    // Moment row selectors, in the order the staging rows are filled.
    const int moment_row[10] = { DST_ROW_00, DST_ROW_10, DST_ROW_01, DST_ROW_20, DST_ROW_11,
                                 DST_ROW_02, DST_ROW_30, DST_ROW_21, DST_ROW_12, DST_ROW_03 };

    const int tid = get_global_id(0);

    int blocks_y = src_rows / tile_height;
    int blocks_x = src_cols / tile_width;
    // Account for a partial tile at the bottom / right edge.
    if (src_rows > TILE_SIZE && src_rows % TILE_SIZE != 0)
        ++blocks_y;
    if (src_cols > TILE_SIZE && src_cols % TILE_SIZE != 0)
        ++blocks_x;
    const int tile_count = blocks_y * blocks_x;

    __local F partial[10][128];

    // Clear the staging slots that no tile will fill.
    if (tid < 128 - tile_count)
    {
        for (int k = 0; k < 10; k++)
            partial[k][tid + tile_count] = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Stride of dst_m in elements of F.
    const int row_stride = dst_step / sizeof(F);
    if (tid < tile_count)
    {
        for (int k = 0; k < 10; k++)
            partial[k][tid] = *(dst_m + mad24(moment_row[k] * blocks_y, row_stride, tid));
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Pairwise fold: each pass adds the upper half of the live range onto the lower half.
    for (int half = 64; half > 0; half >>= 1)
    {
        if (tid < half)
        {
            const int partner = tid + half;
            for (int k = 0; k < 10; k++)
                partial[k][tid] += partial[k][partner];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (tid == 0)
    {
        for (int k = 0; k < 10; k++)
            sum[k] = partial[k][0];
    }
}
// Per-tile raw moments (up to third order) of an 8-bit image.
//
// One work-group handles one TILE_SIZE x TILE_SIZE tile; the work-item with
// local id lidy along dimension 0 accumulates row lidy of the tile, and the
// rows are then combined with a tree reduction through local memory.
//
//   src_data, src_rows, src_cols, src_step - source image (step in bytes)
//   dst_m     - output; the value for moment DST_ROW_pq of tile (wgidy, wgidx)
//               goes to dst_m[DST_ROW_pq * blocky * (dst_step / sizeof(F))
//                             + wgidy * dst_cols + wgidx]
//   dst_cols, dst_step, blocky - layout of dst_m as used in the formula above
//   depth     - unused by this kernel
//   cn, coi   - if coi > 0, channel coi (1-based) is read with a pixel stride
//               of 2 elements when cn == 2 and 4 elements otherwise
//   binary    - non-zero: every non-zero sample counts as 1
//   TILE_SIZE - tile edge length; the reduction assumes TILE_SIZE/2 is a power
//               of two
//
// NOTE(review): lanes of the last 16-wide vector that lie beyond the tile
// width are accumulated as read; this presumably relies on the caller
// providing zero padding - confirm against the host code.
__kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step,
                           __global F* dst_m,
                           int dst_cols, int dst_step, int blocky,
                           int depth, int cn, int coi, int binary, int TILE_SIZE)
{
    uchar tmp_coi[16];          // staging for one vector of channel-of-interest samples
    uchar16 tmp[16];            // this work-item's tile row, 16 samples per entry
    const int VLEN_C = 16;      // vector length of uchar16

    const int wgidy = get_group_id(0);
    const int wgidx = get_group_id(1);
    const int lidy = get_local_id(0);
    const int lidx = get_local_id(1);
    const int y = wgidy * TILE_SIZE;    // first image row of the tile
    const int x = wgidx * TILE_SIZE;    // first image column of the tile
    const int kcn = (cn == 2) ? 2 : 4;
    const int tileSize_height = min(TILE_SIZE, src_rows - y);
    const int tileSize_width = min(TILE_SIZE, src_cols - x);

    // Load row (y + lidy) of the tile into private memory.
    if (y + lidy < src_rows)
    {
        if (coi > 0) // channel of interest: gather one sample per pixel
        {
            for (int i = 0; i < tileSize_width; i += VLEN_C)
            {
                for (int j = 0; j < VLEN_C; j++)
                    tmp_coi[j] = *((__global uchar*)src_data + (y + lidy) * src_step + (x + i + j) * kcn + coi - 1);
                tmp[i / VLEN_C] = (uchar16)(tmp_coi[0], tmp_coi[1], tmp_coi[2], tmp_coi[3],
                                            tmp_coi[4], tmp_coi[5], tmp_coi[6], tmp_coi[7],
                                            tmp_coi[8], tmp_coi[9], tmp_coi[10], tmp_coi[11],
                                            tmp_coi[12], tmp_coi[13], tmp_coi[14], tmp_coi[15]);
            }
        }
        else
        {
            for (int i = 0; i < tileSize_width; i += VLEN_C)
                tmp[i / VLEN_C] = *(src_data + (y + lidy) * src_step / VLEN_C + (x + i) / VLEN_C);
        }
    }

    // Binary mode: map every non-zero sample to 255 (scaled back by 1/255 below).
    if (binary)
    {
        const uchar16 zero = (uchar16)(0);
        const uchar16 full = (uchar16)(255);
        for (int i = 0; i < tileSize_width; i += VLEN_C)
            tmp[i / VLEN_C] = (tmp[i / VLEN_C] != zero) ? full : zero;
    }

    // Local scratch used to exchange partial sums between rows.
    __local int m[10][128];
    if (lidy < 128)
    {
        for (int i = 0; i < 10; i++)
            m[i][lidy] = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Row sums of p, x*p, x^2*p and x^3*p, with x relative to the tile.
    int lm[10] = {0};
    int16 x0 = (int16)(0);
    int16 x1 = (int16)(0);
    int16 x2 = (int16)(0);
    int16 x3 = (int16)(0);
    for (int xt = 0; xt < tileSize_width; xt += VLEN_C)
    {
        int16 v_xt = (int16)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7,
                             xt+8, xt+9, xt+10, xt+11, xt+12, xt+13, xt+14, xt+15);
        int16 p = convert_int16(tmp[xt / VLEN_C]);
        int16 xp = v_xt * p, xxp = xp * v_xt;
        x0 += p;
        x1 += xp;
        x2 += xxp;
        x3 += xxp * v_xt;
    }
    // Horizontal sums into lane 0.
    x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7 + x0.s8 + x0.s9 + x0.sa + x0.sb + x0.sc + x0.sd + x0.se + x0.sf;
    x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7 + x1.s8 + x1.s9 + x1.sa + x1.sb + x1.sc + x1.sd + x1.se + x1.sf;
    x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7 + x2.s8 + x2.s9 + x2.sa + x2.sb + x2.sc + x2.sd + x2.se + x2.sf;
    x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7 + x3.s8 + x3.s9 + x3.sa + x3.sb + x3.sc + x3.sd + x3.se + x3.sf;

    const int py = lidy * ((int)x0.s0);
    const int sy = lidy * lidy;

    // This row's contribution to each tile-local moment.
    int part[10];
    part[0] = x0.s0;                // m00
    part[1] = x1.s0;                // m10
    part[2] = py;                   // m01
    part[3] = x2.s0;                // m20
    part[4] = x1.s0 * lidy;         // m11
    part[5] = x0.s0 * sy;           // m02
    part[6] = x3.s0;                // m30
    part[7] = ((int)x2.s0) * lidy;  // m21
    part[8] = ((int)x1.s0) * sy;    // m12
    part[9] = ((int)py) * sy;       // m03

    // Rows in the upper half of a full-height tile park their values in local
    // memory; rows in the lower half keep theirs in registers.
    const int bheight = min(tileSize_height, TILE_SIZE / 2);
    if (bheight >= TILE_SIZE / 2 && lidy > bheight - 1 && lidy < tileSize_height)
    {
        for (int i = 0; i < 10; i++)
            m[i][lidy - bheight] = part[i];
    }
    else if (lidy < bheight)
    {
        for (int i = 0; i < 10; i++)
            lm[i] = part[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Tree reduction over rows. It starts at TILE_SIZE/2 (not bheight) so that
    // the halving sequence stays on powers of two; starting at a short tile's
    // odd height would drop the partial sums of some rows. Rows without data
    // hold zeros, so they do not disturb the total.
    for (int j = TILE_SIZE / 2; j >= 1; j = j / 2)
    {
        if (lidy < j)
            for (int i = 0; i < 10; i++)
                lm[i] = lm[i] + m[i][lidy];
        barrier(CLK_LOCAL_MEM_FENCE);
        if (lidy >= j / 2 && lidy < j)
            for (int i = 0; i < 10; i++)
                m[i][lidy - j / 2] = lm[i];
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (lidy == 0 && lidx == 0)
    {
        F mom[10];
        for (int mt = 0; mt < 10; mt++)
            mom[mt] = (F)lm[mt];
        if (binary)
        {
            F s = 1./255;
            for (int mt = 0; mt < 10; mt++)
                mom[mt] *= s;
        }
        F xm = x * mom[0], ym = y * mom[0];

        // Shift the tile-local moments (primed) to image coordinates and store them.
        dst_step /= sizeof(F);
        const int tile = mad24(wgidy, dst_cols, wgidx);
        // m00 = m00'
        *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, tile)) = mom[0];
        // m10 = m10' + x*m00'
        *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, tile)) = mom[1] + xm;
        // m01 = m01' + y*m00'
        *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, tile)) = mom[2] + ym;
        // m20 = m20' + 2*x*m10' + x*x*m00'
        *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, tile)) = mom[3] + x * (mom[1] * 2 + xm);
        // m11 = m11' + x*m01' + y*m10' + x*y*m00'
        *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, tile)) = mom[4] + x * (mom[2] + ym) + y * mom[1];
        // m02 = m02' + 2*y*m01' + y*y*m00'
        *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, tile)) = mom[5] + y * (mom[2] * 2 + ym);
        // m30 = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00'
        *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, tile)) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
        // m21 = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20'
        *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, tile)) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
        // m12 = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02'
        *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, tile)) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
        // m03 = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00'
        *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, tile)) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
    }
}
// Per-tile raw moments (up to third order) of a 16-bit unsigned image.
//
// One work-group handles one TILE_SIZE x TILE_SIZE tile; the work-item with
// local id lidy along dimension 0 accumulates row lidy of the tile, and the
// rows are then combined with a tree reduction through local memory.
//
//   src_data, src_rows, src_cols, src_step - source image (step in bytes)
//   dst_m     - output; the value for moment DST_ROW_pq of tile (wgidy, wgidx)
//               goes to dst_m[DST_ROW_pq * blocky * (dst_step / sizeof(F))
//                             + wgidy * dst_cols + wgidx]
//   dst_cols, dst_step, blocky - layout of dst_m as used in the formula above
//   depth     - unused by this kernel
//   cn, coi   - if coi > 0, channel coi (1-based) is read with a pixel stride
//               of 2 elements when cn == 2 and 4 elements otherwise
//   binary    - non-zero: every non-zero sample counts as 1
//   TILE_SIZE - tile edge length; the reduction assumes TILE_SIZE/2 is a power
//               of two
//
// NOTE(review): lanes of the last 8-wide vector that lie beyond the tile
// width are accumulated as read; this presumably relies on the caller
// providing zero padding - confirm against the host code.
__kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step,
                           __global F* dst_m,
                           int dst_cols, int dst_step, int blocky,
                           int depth, int cn, int coi, int binary, const int TILE_SIZE)
{
    ushort tmp_coi[8];          // staging for one vector of channel-of-interest samples
    ushort8 tmp[32];            // this work-item's tile row, 8 samples per entry
    const int VLEN_US = 8;      // vector length of ushort8

    const int wgidy = get_group_id(0);
    const int wgidx = get_group_id(1);
    const int lidy = get_local_id(0);
    const int lidx = get_local_id(1);
    const int y = wgidy * TILE_SIZE;    // first image row of the tile
    const int x = wgidx * TILE_SIZE;    // first image column of the tile
    const int kcn = (cn == 2) ? 2 : 4;
    const int tileSize_height = min(TILE_SIZE, src_rows - y);
    const int tileSize_width = min(TILE_SIZE, src_cols - x);

    // Load row (y + lidy) of the tile into private memory.
    if (y + lidy < src_rows)
    {
        if (coi > 0) // channel of interest: gather one sample per pixel
        {
            for (int i = 0; i < tileSize_width; i += VLEN_US)
            {
                for (int j = 0; j < VLEN_US; j++)
                    tmp_coi[j] = *((__global ushort*)src_data + (y + lidy) * (int)src_step / 2 + (x + i + j) * kcn + coi - 1);
                tmp[i / VLEN_US] = (ushort8)(tmp_coi[0], tmp_coi[1], tmp_coi[2], tmp_coi[3],
                                             tmp_coi[4], tmp_coi[5], tmp_coi[6], tmp_coi[7]);
            }
        }
        else
        {
            for (int i = 0; i < tileSize_width; i += VLEN_US)
                tmp[i / VLEN_US] = *(src_data + (y + lidy) * src_step / (2 * VLEN_US) + (x + i) / VLEN_US);
        }
    }

    // Binary mode: map every non-zero sample to 255 (scaled back by 1/255 below).
    if (binary)
    {
        const ushort8 zero = (ushort8)(0);
        const ushort8 full = (ushort8)(255);
        for (int i = 0; i < tileSize_width; i += VLEN_US)
            tmp[i / VLEN_US] = (tmp[i / VLEN_US] != zero) ? full : zero;
    }

    // Local scratch used to exchange partial sums between rows.
    __local long m[10][128];
    if (lidy < 128)
    {
        for (int i = 0; i < 10; i++)
            m[i][lidy] = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Row sums of p, x*p, x^2*p (32-bit) and x^3*p (64-bit), x relative to the tile.
    long lm[10] = {0};
    int8 x0 = (int8)(0);
    int8 x1 = (int8)(0);
    int8 x2 = (int8)(0);
    long8 x3 = (long8)(0);
    for (int xt = 0; xt < tileSize_width; xt += VLEN_US)
    {
        int8 v_xt = (int8)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7);
        int8 p = convert_int8(tmp[xt / VLEN_US]);
        int8 xp = v_xt * p, xxp = xp * v_xt;
        x0 += p;
        x1 += xp;
        x2 += xxp;
        x3 += convert_long8(xxp) * convert_long8(v_xt);
    }
    // Horizontal sums into lane 0.
    x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7;
    x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7;
    x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7;
    x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7;

    const int py = lidy * x0.s0, sy = lidy * lidy;

    // This row's contribution to each tile-local moment.
    long part[10];
    part[0] = x0.s0;                 // m00
    part[1] = x1.s0;                 // m10
    part[2] = py;                    // m01
    part[3] = x2.s0;                 // m20
    part[4] = x1.s0 * lidy;          // m11
    part[5] = x0.s0 * sy;            // m02
    part[6] = x3.s0;                 // m30
    part[7] = ((long)x2.s0) * lidy;  // m21
    part[8] = ((long)x1.s0) * sy;    // m12
    part[9] = ((long)py) * sy;       // m03

    // Rows in the upper half of a full-height tile park their values in local
    // memory; rows in the lower half keep theirs in registers.
    const int bheight = min(tileSize_height, TILE_SIZE / 2);
    if (bheight >= TILE_SIZE / 2 && lidy > bheight - 1 && lidy < tileSize_height)
    {
        for (int i = 0; i < 10; i++)
            m[i][lidy - bheight] = part[i];
    }
    else if (lidy < bheight)
    {
        for (int i = 0; i < 10; i++)
            lm[i] = part[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Tree reduction over rows. The add and the park step must alternate, with
    // a barrier after each, inside the same pass: a pass may only read m[]
    // after the previous pass has finished writing it. (Running all adds first
    // and all parks afterwards sums stale values several times.)
    for (int j = TILE_SIZE / 2; j >= 1; j = j / 2)
    {
        if (lidy < j)
            for (int i = 0; i < 10; i++)
                lm[i] = lm[i] + m[i][lidy];
        barrier(CLK_LOCAL_MEM_FENCE);
        if (lidy >= j / 2 && lidy < j)
            for (int i = 0; i < 10; i++)
                m[i][lidy - j / 2] = lm[i];
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (lidy == 0 && lidx == 0)
    {
        F mom[10];
        for (int mt = 0; mt < 10; mt++)
            mom[mt] = (F)lm[mt];
        if (binary)
        {
            F s = 1./255;
            for (int mt = 0; mt < 10; mt++)
                mom[mt] *= s;
        }
        F xm = x * mom[0], ym = y * mom[0];

        // Shift the tile-local moments (primed) to image coordinates and store them.
        dst_step /= sizeof(F);
        const int tile = mad24(wgidy, dst_cols, wgidx);
        // m00 = m00'
        *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, tile)) = mom[0];
        // m10 = m10' + x*m00'
        *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, tile)) = mom[1] + xm;
        // m01 = m01' + y*m00'
        *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, tile)) = mom[2] + ym;
        // m20 = m20' + 2*x*m10' + x*x*m00'
        *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, tile)) = mom[3] + x * (mom[1] * 2 + xm);
        // m11 = m11' + x*m01' + y*m10' + x*y*m00'
        *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, tile)) = mom[4] + x * (mom[2] + ym) + y * mom[1];
        // m02 = m02' + 2*y*m01' + y*y*m00'
        *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, tile)) = mom[5] + y * (mom[2] * 2 + ym);
        // m30 = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00'
        *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, tile)) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
        // m21 = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20'
        *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, tile)) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
        // m12 = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02'
        *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, tile)) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
        // m03 = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00'
        *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, tile)) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
    }
}
// Per-tile raw moments (up to third order) of a 16-bit signed image.
//
// One work-group handles one TILE_SIZE x TILE_SIZE tile. The work-item with
// local id lidy along dimension 0 accumulates row lidy of the tile; the rows
// are then combined by a tree reduction through local memory, and work-item
// (0, 0) converts the totals to image coordinates and stores them.
//
//   src_data, src_rows, src_cols, src_step - source image (step in bytes)
//   dst_m     - output; the value for moment DST_ROW_pq of tile (wgidy, wgidx)
//               goes to dst_m[DST_ROW_pq * blocky * (dst_step / sizeof(F))
//                             + wgidy * dst_cols + wgidx]
//   dst_cols, dst_step, blocky - layout of dst_m as used in the formula above
//   depth     - not read by this kernel
//   cn, coi   - if coi > 0, channel coi (1-based) is read with a pixel stride
//               of 2 elements when cn == 2 and 4 elements otherwise
//   binary    - non-zero: every non-zero sample counts as 1
//   TILE_SIZE - tile edge length
__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step,
__global F* dst_m,
int dst_cols, int dst_step, int blocky,
int depth, int cn, int coi, int binary, const int TILE_SIZE)
{
short tmp_coi[8]; // staging for one vector of channel-of-interest samples
short8 tmp[32]; // this work-item's tile row, 8 samples per entry
int VLEN_S =8; // vector length of short8
int gidy = get_global_id(0);
int gidx = get_global_id(1);
int wgidy = get_group_id(0);
int wgidx = get_group_id(1);
int lidy = get_local_id(0);
int lidx = get_local_id(1);
int y = wgidy*TILE_SIZE; // image row of the tile's first row
int x = wgidx*TILE_SIZE; // image column of the tile's first column
int kcn = (cn==2)?2:4;
int rstep = min(src_step/2, TILE_SIZE);
int tileSize_height = min(TILE_SIZE, src_rows - y);
int tileSize_width = min(TILE_SIZE, src_cols -x);
if ( y+lidy < src_rows )
{
// NOTE(review): whenever tileSize_width < TILE_SIZE it equals src_cols - x,
// so (x+i) < src_cols is already false at i == tileSize_width and this
// zero-fill of the source appears never to execute.
if(tileSize_width < TILE_SIZE)
for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
*((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0;
if( coi > 0 )
// Channel of interest: gather one sample per pixel into a vector.
for(int i=0; i < tileSize_width; i+=VLEN_S)
{
for(int j=0; j<VLEN_S; j++)
tmp_coi[j] = *((__global short*)src_data+(y+lidy)*src_step/2+(x+i+j)*kcn+coi-1);
tmp[i/VLEN_S] = (short8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
}
else
// Whole-vector loads of the row.
for(int i=0; i < tileSize_width; i+=VLEN_S)
tmp[i/VLEN_S] = *(src_data+(y+lidy)*src_step/(2*VLEN_S)+(x+i)/VLEN_S);
}
short8 zero = (short8)(0);
short8 full = (short8)(255);
// Binary mode: map every non-zero sample to 255 (scaled back by 1/255 below).
if( binary )
for(int i=0; i < tileSize_width; i+=(VLEN_S))
tmp[i/VLEN_S] = (tmp[i/VLEN_S]!=zero)?full:zero;
F mom[10];
// Local scratch used to exchange partial sums between rows; cleared first.
__local long m[10][128];
if(lidy < 128)
for(int i=0; i<10; i++)
m[i][lidy]=0;
barrier(CLK_LOCAL_MEM_FENCE);
long lm[10] = {0};
// Row sums of p, x*p, x^2*p (32-bit lanes) and x^3*p (64-bit lanes),
// with x counted from the tile's first column.
int8 x0 = (int8)(0);
int8 x1 = (int8)(0);
int8 x2 = (int8)(0);
long8 x3 = (long8)(0);
for( int xt = 0 ; xt < tileSize_width; xt+= (VLEN_S))
{
int8 v_xt = (int8)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7);
int8 p = convert_int8(tmp[xt/VLEN_S]);
int8 xp = v_xt * p, xxp = xp * v_xt;
x0 += p;
x1 += xp;
x2 += xxp;
x3 += convert_long8(xxp) * convert_long8(v_xt);
}
// Horizontal sums: lane 0 of each accumulator receives the row total.
x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7;
x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7;
x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7;
x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7;
int py = lidy * x0.s0, sy = lidy*lidy;
int bheight = min(tileSize_height, TILE_SIZE/2);
// Rows at or above bheight (only when the tile has at least TILE_SIZE/2 rows)
// park their values in local memory at index lidy-bheight ...
if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
{
m[9][lidy-bheight] = ((long)py) * sy; // m03
m[8][lidy-bheight] = ((long)x1.s0) * sy; // m12
m[7][lidy-bheight] = ((long)x2.s0) * lidy; // m21
m[6][lidy-bheight] = x3.s0; // m30
m[5][lidy-bheight] = x0.s0 * sy; // m02
m[4][lidy-bheight] = x1.s0 * lidy; // m11
m[3][lidy-bheight] = x2.s0; // m20
m[2][lidy-bheight] = py; // m01
m[1][lidy-bheight] = x1.s0; // m10
m[0][lidy-bheight] = x0.s0; // m00
}
// ... while rows below bheight keep theirs in private lm[].
else if(lidy < bheight)
{
lm[9] = ((long)py) * sy; // m03
lm[8] = ((long)(x1.s0)) * sy; // m12
lm[7] = ((long)(x2.s0)) * lidy; // m21
lm[6] = x3.s0; // m30
lm[5] = x0.s0 * sy; // m02
lm[4] = x1.s0 * lidy; // m11
lm[3] = x2.s0; // m20
lm[2] = py; // m01
lm[1] = x1.s0; // m10
lm[0] = x0.s0; // m00
}
barrier(CLK_LOCAL_MEM_FENCE);
// Tree reduction over rows: in each pass the first j work-items add the value
// parked for them in m[], then the upper half of those (j/2 <= lidy < j) park
// their running sums for the next pass. Both steps are followed by a barrier.
for( int j = TILE_SIZE/2; j >=1; j = j/2 )
{
if(lidy < j)
for( int i = 0; i < 10; i++ )
lm[i] = lm[i] + m[i][lidy];
barrier(CLK_LOCAL_MEM_FENCE);
if(lidy >= j/2&&lidy < j)
for( int i = 0; i < 10; i++ )
m[i][lidy-j/2] = lm[i];
barrier(CLK_LOCAL_MEM_FENCE);
}
// Work-item (0, 0) now holds the tile totals in lm[].
if(lidy ==0 &&lidx ==0)
{
for(int mt = 0; mt < 10; mt++ )
mom[mt] = (F)lm[mt];
if(binary)
{
// Undo the 255 used to mark non-zero samples.
F s = 1./255;
for( int mt = 0; mt < 10; mt++ )
mom[mt] *= s;
}
F xm = x * mom[0], ym = y*mom[0];
// Shift the tile-local moments (primed) by the tile origin (x, y) and store them.
dst_step /= sizeof(F);
// + m00 ( = m00' )
*(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
// + m10 ( = m10' + x*m00' )
*(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
// + m01 ( = m01' + y*m00' )
*(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
// + m20 ( = m20' + 2*x*m10' + x*x*m00' )
*(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
*(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
// + m02 ( = m02' + 2*y*m01' + y*y*m00' )
*(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
*(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
*(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
*(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
*(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
}
}
// Per-tile raw moments (up to third order) of a 32-bit float image.
//
// One work-group handles one TILE_SIZE x TILE_SIZE tile; the work-item with
// local id lidy along dimension 0 accumulates row lidy of the tile, and the
// rows are then combined with a tree reduction through local memory.
//
//   src_data, src_rows, src_cols, src_step - source image (step in bytes)
//   dst_m     - output; the value for moment DST_ROW_pq of tile (wgidy, wgidx)
//               goes to dst_m[DST_ROW_pq * blocky * (dst_step / sizeof(F))
//                             + wgidy * dst_cols + wgidx]
//   dst_cols, dst_step, blocky - layout of dst_m as used in the formula above
//   depth     - unused by this kernel
//   cn, coi   - if coi > 0, channel coi (1-based) is read with a pixel stride
//               of 2 elements when cn == 2 and 4 elements otherwise
//   binary    - non-zero: every non-zero sample counts as 1
//   TILE_SIZE - tile edge length; the reduction assumes TILE_SIZE/2 is a power
//               of two
__kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step,
                            __global F* dst_m,
                            int dst_cols, int dst_step, int blocky,
                            int depth, int cn, int coi, int binary, const int TILE_SIZE)
{
    float tmp_coi[4];           // staging for the four lanes of one vector
    float4 tmp[64];             // this work-item's tile row, 4 samples per entry
    const int VLEN_F = 4;       // vector length of float4

    const int wgidy = get_group_id(0);
    const int wgidx = get_group_id(1);
    const int lidy = get_local_id(0);
    const int lidx = get_local_id(1);
    const int y = wgidy * TILE_SIZE;    // first image row of the tile
    const int x = wgidx * TILE_SIZE;    // first image column of the tile
    const int kcn = (cn == 2) ? 2 : 4;
    const int tileSize_height = min(TILE_SIZE, src_rows - y);
    const int tileSize_width = min(TILE_SIZE, src_cols - x);

    // Load row (y + lidy) of the tile into private memory.
    if (y + lidy < src_rows)
    {
        if (coi > 0) // channel of interest: gather one sample per pixel
        {
            for (int i = 0; i < tileSize_width; i += VLEN_F)
            {
                for (int j = 0; j < 4; j++)
                    tmp_coi[j] = *(src_data + (y + lidy) * src_step / 4 + (x + i + j) * kcn + coi - 1);
                tmp[i / VLEN_F] = (float4)(tmp_coi[0], tmp_coi[1], tmp_coi[2], tmp_coi[3]);
            }
        }
        else
        {
            __global const float* row = src_data + (y + lidy) * src_step / 4;
            for (int i = 0; i < tileSize_width; i += VLEN_F)
            {
                // Lanes past the last image column must not be read: they may
                // lie outside the buffer and would otherwise be summed into
                // the moments. They contribute zero instead.
                for (int j = 0; j < VLEN_F; j++)
                    tmp_coi[j] = (x + i + j) < src_cols ? row[x + i + j] : 0.0f;
                tmp[i / VLEN_F] = (float4)(tmp_coi[0], tmp_coi[1], tmp_coi[2], tmp_coi[3]);
            }
        }
    }

    // Binary mode: map every non-zero sample to 255 (scaled back by 1/255 below).
    if (binary)
    {
        const float4 zero = (float4)(0);
        const float4 full = (float4)(255);
        for (int i = 0; i < tileSize_width; i += VLEN_F)
            tmp[i / VLEN_F] = (tmp[i / VLEN_F] != zero) ? full : zero;
    }

    // Local scratch used to exchange partial sums between rows.
    __local F m[10][128];
    if (lidy < 128)
    {
        for (int i = 0; i < 10; i++)
            m[i][lidy] = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Row sums of p, x*p, x^2*p and x^3*p, with x relative to the tile.
    F lm[10] = {0};
    F4 x0 = (F4)(0);
    F4 x1 = (F4)(0);
    F4 x2 = (F4)(0);
    F4 x3 = (F4)(0);
    for (int xt = 0; xt < tileSize_width; xt += VLEN_F)
    {
        F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3);
        F4 p = convert_F4(tmp[xt / VLEN_F]);
        F4 xp = v_xt * p, xxp = xp * v_xt;
        x0 += p;
        x1 += xp;
        x2 += xxp;
        x3 += xxp * v_xt;
    }
    // Horizontal sums into lane 0.
    x0.s0 += x0.s1 + x0.s2 + x0.s3;
    x1.s0 += x1.s1 + x1.s2 + x1.s3;
    x2.s0 += x2.s1 + x2.s2 + x2.s3;
    x3.s0 += x3.s1 + x3.s2 + x3.s3;

    F py = lidy * x0.s0, sy = lidy * lidy;

    // This row's contribution to each tile-local moment.
    F part[10];
    part[0] = x0.s0;              // m00
    part[1] = x1.s0;              // m10
    part[2] = py;                 // m01
    part[3] = x2.s0;              // m20
    part[4] = x1.s0 * lidy;       // m11
    part[5] = x0.s0 * sy;         // m02
    part[6] = x3.s0;              // m30
    part[7] = ((F)x2.s0) * lidy;  // m21
    part[8] = ((F)x1.s0) * sy;    // m12
    part[9] = ((F)py) * sy;       // m03

    // Rows in the upper half of a full-height tile park their values in local
    // memory; rows in the lower half keep theirs in registers.
    const int bheight = min(tileSize_height, TILE_SIZE / 2);
    if (bheight >= TILE_SIZE / 2 && lidy > bheight - 1 && lidy < tileSize_height)
    {
        for (int i = 0; i < 10; i++)
            m[i][lidy - bheight] = part[i];
    }
    else if (lidy < bheight)
    {
        for (int i = 0; i < 10; i++)
            lm[i] = part[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Tree reduction over rows: add the parked partner value, then the upper
    // half of the live range parks its running sums for the next pass.
    for (int j = TILE_SIZE / 2; j >= 1; j = j / 2)
    {
        if (lidy < j)
            for (int i = 0; i < 10; i++)
                lm[i] = lm[i] + m[i][lidy];
        barrier(CLK_LOCAL_MEM_FENCE);
        if (lidy >= j / 2 && lidy < j)
            for (int i = 0; i < 10; i++)
                m[i][lidy - j / 2] = lm[i];
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (lidy == 0 && lidx == 0)
    {
        F mom[10];
        for (int mt = 0; mt < 10; mt++)
            mom[mt] = (F)lm[mt];
        if (binary)
        {
            F s = 1./255;
            for (int mt = 0; mt < 10; mt++)
                mom[mt] *= s;
        }
        F xm = x * mom[0], ym = y * mom[0];

        // Shift the tile-local moments (primed) to image coordinates and store them.
        dst_step /= sizeof(F);
        const int tile = mad24(wgidy, dst_cols, wgidx);
        // m00 = m00'
        *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, tile)) = mom[0];
        // m10 = m10' + x*m00'
        *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, tile)) = mom[1] + xm;
        // m01 = m01' + y*m00'
        *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, tile)) = mom[2] + ym;
        // m20 = m20' + 2*x*m10' + x*x*m00'
        *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, tile)) = mom[3] + x * (mom[1] * 2 + xm);
        // m11 = m11' + x*m01' + y*m10' + x*y*m00'
        *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, tile)) = mom[4] + x * (mom[2] + ym) + y * mom[1];
        // m02 = m02' + 2*y*m01' + y*y*m00'
        *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, tile)) = mom[5] + y * (mom[2] * 2 + ym);
        // m30 = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00'
        *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, tile)) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
        // m21 = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20'
        *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, tile)) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
        // m12 = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02'
        *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, tile)) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
        // m03 = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00'
        *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, tile)) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
    }
}
// Per-tile raw moments (up to third order) of an image whose samples have
// type F.
//
// One work-group handles one TILE_SIZE x TILE_SIZE tile; the work-item with
// local id lidy along dimension 0 accumulates row lidy of the tile, and the
// rows are then combined with a tree reduction through local memory.
//
//   src_data, src_rows, src_cols, src_step - source image (step in bytes;
//               the row offset is computed as src_step / 8 elements)
//   dst_m     - output; the value for moment DST_ROW_pq of tile (wgidy, wgidx)
//               goes to dst_m[DST_ROW_pq * blocky * (dst_step / sizeof(F))
//                             + wgidy * dst_cols + wgidx]
//   dst_cols, dst_step, blocky - layout of dst_m as used in the formula above
//   depth     - unused by this kernel
//   cn, coi   - if coi > 0, channel coi (1-based) is read with a pixel stride
//               of 2 elements when cn == 2 and 4 elements otherwise
//   binary    - non-zero: every non-zero sample counts as 1
//   TILE_SIZE - tile edge length; the reduction assumes TILE_SIZE/2 is a power
//               of two
//
// NOTE(review): the src_step / 8 row offset matches 8-byte samples, i.e. a
// build with DOUBLE_SUPPORT; presumably the host only launches this kernel in
// that configuration - confirm.
__kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, int src_step,
                           __global F* dst_m,
                           int dst_cols, int dst_step, int blocky,
                           int depth, int cn, int coi, int binary, const int TILE_SIZE)
{
    F tmp_coi[4];               // staging for the four lanes of one vector
    F4 tmp[64];                 // this work-item's tile row, 4 samples per entry
    const int VLEN_D = 4;       // length of vector

    const int wgidy = get_group_id(0);
    const int wgidx = get_group_id(1);
    const int lidy = get_local_id(0);
    const int lidx = get_local_id(1);
    const int y = wgidy * TILE_SIZE;    // first image row of the tile
    const int x = wgidx * TILE_SIZE;    // first image column of the tile
    const int kcn = (cn == 2) ? 2 : 4;
    const int tileSize_height = min(TILE_SIZE, src_rows - y);
    const int tileSize_width = min(TILE_SIZE, src_cols - x);

    // Load row (y + lidy) of the tile into private memory. Every vector that
    // the accumulation loop below consumes is fully written here; lanes that
    // fail the bounds check are set to zero instead of being left
    // uninitialised.
    if (y + lidy < src_rows)
    {
        __global const F* row = src_data + (y + lidy) * src_step / 8;
        if (coi > 0) // channel of interest: gather one sample per pixel
        {
            for (int i = 0; i < tileSize_width; i += VLEN_D)
            {
                for (int j = 0; j < 4; j++)
                {
                    // NOTE(review): this bound compares an element offset
                    // against src_cols, as the original code did; verify the
                    // intended units against the caller.
                    const int off = (x + i + j) * kcn + coi - 1;
                    tmp_coi[j] = off < src_cols ? row[off] : (F)0;
                }
                tmp[i / VLEN_D] = (F4)(tmp_coi[0], tmp_coi[1], tmp_coi[2], tmp_coi[3]);
            }
        }
        else
        {
            for (int i = 0; i < tileSize_width; i += VLEN_D)
            {
                for (int j = 0; j < VLEN_D; j++)
                    tmp_coi[j] = (x + i + j) < src_cols ? row[x + i + j] : (F)0;
                tmp[i / VLEN_D] = (F4)(tmp_coi[0], tmp_coi[1], tmp_coi[2], tmp_coi[3]);
            }
        }
    }

    // Binary mode: map every non-zero sample to 255 (scaled back by 1/255 below).
    if (binary)
    {
        const F4 zero = (F4)(0);
        const F4 full = (F4)(255);
        for (int i = 0; i < tileSize_width; i += VLEN_D)
            tmp[i / VLEN_D] = (tmp[i / VLEN_D] != zero) ? full : zero;
    }

    // Local scratch used to exchange partial sums between rows.
    __local F m[10][128];
    if (lidy < 128)
    {
        for (int i = 0; i < 10; i++)
            m[i][lidy] = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Row sums of p, x*p, x^2*p and x^3*p, with x relative to the tile.
    F lm[10] = {0};
    F4 x0 = (F4)(0);
    F4 x1 = (F4)(0);
    F4 x2 = (F4)(0);
    F4 x3 = (F4)(0);
    for (int xt = 0; xt < tileSize_width; xt += VLEN_D)
    {
        F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3);
        F4 p = tmp[xt / VLEN_D];
        F4 xp = v_xt * p, xxp = xp * v_xt;
        x0 += p;
        x1 += xp;
        x2 += xxp;
        x3 += xxp * v_xt;
    }
    // Horizontal sums into lane 0.
    x0.s0 += x0.s1 + x0.s2 + x0.s3;
    x1.s0 += x1.s1 + x1.s2 + x1.s3;
    x2.s0 += x2.s1 + x2.s2 + x2.s3;
    x3.s0 += x3.s1 + x3.s2 + x3.s3;

    F py = lidy * x0.s0, sy = lidy * lidy;

    // This row's contribution to each tile-local moment.
    F part[10];
    part[0] = x0.s0;              // m00
    part[1] = x1.s0;              // m10
    part[2] = py;                 // m01
    part[3] = x2.s0;              // m20
    part[4] = x1.s0 * lidy;       // m11
    part[5] = x0.s0 * sy;         // m02
    part[6] = x3.s0;              // m30
    part[7] = ((F)x2.s0) * lidy;  // m21
    part[8] = ((F)x1.s0) * sy;    // m12
    part[9] = ((F)py) * sy;       // m03

    // Rows in the upper half of a full-height tile park their values in local
    // memory; rows in the lower half keep theirs in registers.
    const int bheight = min(tileSize_height, TILE_SIZE / 2);
    if (bheight >= TILE_SIZE / 2 && lidy > bheight - 1 && lidy < tileSize_height)
    {
        for (int i = 0; i < 10; i++)
            m[i][lidy - bheight] = part[i];
    }
    else if (lidy < bheight)
    {
        for (int i = 0; i < 10; i++)
            lm[i] = part[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Tree reduction over rows: add the parked partner value, then the upper
    // half of the live range parks its running sums for the next pass.
    for (int j = TILE_SIZE / 2; j >= 1; j = j / 2)
    {
        if (lidy < j)
            for (int i = 0; i < 10; i++)
                lm[i] = lm[i] + m[i][lidy];
        barrier(CLK_LOCAL_MEM_FENCE);
        if (lidy >= j / 2 && lidy < j)
            for (int i = 0; i < 10; i++)
                m[i][lidy - j / 2] = lm[i];
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (lidy == 0 && lidx == 0)
    {
        F mom[10];
        for (int mt = 0; mt < 10; mt++)
            mom[mt] = (F)lm[mt];
        if (binary)
        {
            F s = 1./255;
            for (int mt = 0; mt < 10; mt++)
                mom[mt] *= s;
        }
        F xm = x * mom[0], ym = y * mom[0];

        // Shift the tile-local moments (primed) to image coordinates and store them.
        dst_step /= sizeof(F);
        const int tile = mad24(wgidy, dst_cols, wgidx);
        // m00 = m00'
        *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, tile)) = mom[0];
        // m10 = m10' + x*m00'
        *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, tile)) = mom[1] + xm;
        // m01 = m01' + y*m00'
        *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, tile)) = mom[2] + ym;
        // m20 = m20' + 2*x*m10' + x*x*m00'
        *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, tile)) = mom[3] + x * (mom[1] * 2 + xm);
        // m11 = m11' + x*m01' + y*m10' + x*y*m00'
        *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, tile)) = mom[4] + x * (mom[2] + ym) + y * mom[1];
        // m02 = m02' + 2*y*m01' + y*y*m00'
        *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, tile)) = mom[5] + y * (mom[2] * 2 + ym);
        // m30 = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00'
        *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, tile)) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
        // m21 = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20'
        *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, tile)) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
        // m12 = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02'
        *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, tile)) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
        // m03 = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00'
        *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, tile)) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
    }
}

@ -0,0 +1,228 @@
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Niko Li, newlife20080214@gmail.com
// Zero Lin, zero.lin@amd.com
// Yao Wang, bitwangyaoyao@gmail.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//
// MORPH_OP is the per-pixel reduction applied over the structuring element:
// min() when ERODE is defined, max() when DILATE is defined.
#ifdef ERODE
#define MORPH_OP(A,B) min((A),(B))
#endif
#ifdef DILATE
#define MORPH_OP(A,B) max((A),(B))
#endif
//BORDER_CONSTANT: iiiiii|abcdefgh|iiiiiii
// ELEM evaluates to elem1 when index i lies outside [l_edge, r_edge) and to elem2 otherwise.
// NOTE: the expansion is not wrapped in parentheses, so it is only safe as the
// complete right-hand side of an assignment (which is how the kernels below use it).
#define ELEM(i,l_edge,r_edge,elem1,elem2) (i)<(l_edge) | (i) >= (r_edge) ? (elem1) : (elem2)
#ifndef GENTYPE
// Erode/dilate kernel specialised for single-channel 8-bit images.
// Each work-item produces four horizontally adjacent output pixels (one uchar4).
// The work-group first stages a source tile in local memory as uchar4 values,
// then every work-item reduces its neighbourhood with MORPH_OP.
//
// LSIZE0, LSIZE1, RADIUSX, RADIUSY and VAL are not defined in this file;
// presumably they are supplied as build options (work-group size, structuring
// element radii, border value) -- TODO confirm against the host code.
//
//   src, dst                  source / destination buffers (indexed in bytes here)
//   src_offset_x/y            position of the ROI inside the whole source image
//   cols, rows                size of the destination region to write
//   src_step_in_pixel,
//   dst_step_in_pixel         row strides used for address computation
//   mat_kernel                structuring element mask, (2*RADIUSY+1) x (2*RADIUSX+1);
//                             ignored when RECTKERNEL is defined
//   src_whole_cols/rows       size of the whole source image (for border tests)
//   dst_offset_in_pixel       offset of the first destination pixel
__kernel void morph_C1_D0(__global const uchar * restrict src,
__global uchar *dst,
int src_offset_x, int src_offset_y,
int cols, int rows,
int src_step_in_pixel, int dst_step_in_pixel,
__constant uchar * mat_kernel,
int src_whole_cols, int src_whole_rows,
int dst_offset_in_pixel)
{
int l_x = get_local_id(0);
int l_y = get_local_id(1);
// First output pixel covered by this work-group (4 pixels per work-item in x).
int x = get_group_id(0)*4*LSIZE0;
int y = get_group_id(1)*LSIZE1;
// Tile bounds in x with the two low bits cleared, so the tile starts and ends
// on a multiple of 4 and can be fetched as whole uchar4 values.
int start_x = x+src_offset_x-RADIUSX & 0xfffffffc;
int end_x = x + src_offset_x+LSIZE0*4+RADIUSX & 0xfffffffc;
// Tile row length measured in uchar4 elements.
int width = (end_x -start_x+4)>>2;
// Byte offset of the first needed pixel inside the first uchar4 of a tile row.
int offset = src_offset_x-RADIUSX & 3;
int start_y = y+src_offset_y-RADIUSY;
// Every work-item loads two tile elements: slot point1 and slot point2.
int point1 = mad24(l_y,LSIZE0,l_x);
int point2 = point1 + LSIZE0*LSIZE1;
// Convert the linear slot indices into tile coordinates (x in pixels, y in rows).
int tl_x = (point1 % width)<<2;
int tl_y = point1 / width;
int tl_x2 = (point2 % width)<<2;
int tl_y2 = point2 / width;
int cur_x = start_x + tl_x;
int cur_y = start_y + tl_y;
int cur_x2 = start_x + tl_x2;
int cur_y2 = start_y + tl_y2;
int start_addr = mad24(cur_y,src_step_in_pixel,cur_x);
int start_addr2 = mad24(cur_y2,src_step_in_pixel,cur_x2);
uchar4 temp0,temp1;
__local uchar4 LDS_DAT[2*LSIZE1*LSIZE0];
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
//read pixels from src
// Addresses outside (0, end_addr) are redirected to element 0 so the load stays
// inside the buffer; the loaded value is overwritten with VAL below if needed.
start_addr = ((start_addr < end_addr) && (start_addr > 0)) ? start_addr : 0;
start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? start_addr2 : 0;
temp0 = *(__global uchar4*)&src[start_addr];
temp1 = *(__global uchar4*)&src[start_addr2];
//judge if read out of boundary
// Replace every component whose column (or whole row) is outside the image by VAL.
temp0.x= ELEM(cur_x,0,src_whole_cols,VAL,temp0.x);
temp0.y= ELEM(cur_x+1,0,src_whole_cols,VAL,temp0.y);
temp0.z= ELEM(cur_x+2,0,src_whole_cols,VAL,temp0.z);
temp0.w= ELEM(cur_x+3,0,src_whole_cols,VAL,temp0.w);
temp0= ELEM(cur_y,0,src_whole_rows,(uchar4)VAL,temp0);
temp1.x= ELEM(cur_x2,0,src_whole_cols,VAL,temp1.x);
temp1.y= ELEM(cur_x2+1,0,src_whole_cols,VAL,temp1.y);
temp1.z= ELEM(cur_x2+2,0,src_whole_cols,VAL,temp1.z);
temp1.w= ELEM(cur_x2+3,0,src_whole_cols,VAL,temp1.w);
temp1= ELEM(cur_y2,0,src_whole_rows,(uchar4)VAL,temp1);
LDS_DAT[point1] = temp0;
LDS_DAT[point2] = temp1;
// Make the whole tile visible to every work-item of the group before reducing.
barrier(CLK_LOCAL_MEM_FENCE);
uchar4 res = (uchar4)VAL;
// Walk the structuring element. Unless RECTKERNEL is defined, positions whose
// mat_kernel entry is zero leave res unchanged. The vload4 reads four bytes at
// byte offset (offset + j) from the tile element at row l_y+i, column l_x.
for(int i=0; i<2*RADIUSY+1; i++)
for(int j=0; j<2*RADIUSX+1; j++)
{
res =
#ifndef RECTKERNEL
mat_kernel[i*(2*RADIUSX+1)+j] ?
#endif
MORPH_OP(res,vload4(0,(__local uchar*)&LDS_DAT[mad24((l_y+i),width,l_x)]+offset+j))
#ifndef RECTKERNEL
:res
#endif
;
}
int gidx = get_global_id(0)<<2;
int gidy = get_global_id(1);
int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel);
// Fast path: all four pixels are inside the region and the destination offset
// is a multiple of 4, so a single uchar4 store is used.
if(gidx+3<cols && gidy<rows && ((dst_offset_in_pixel&3)==0))
{
*(__global uchar4*)&dst[out_addr] = res;
}
else
{
// Slow path: store only the pixels that fall inside the region, byte by byte.
if(gidx+3<cols && gidy<rows)
{
dst[out_addr] = res.x;
dst[out_addr+1] = res.y;
dst[out_addr+2] = res.z;
dst[out_addr+3] = res.w;
}
else if(gidx+2<cols && gidy<rows)
{
dst[out_addr] = res.x;
dst[out_addr+1] = res.y;
dst[out_addr+2] = res.z;
}
else if(gidx+1<cols && gidy<rows)
{
dst[out_addr] = res.x;
dst[out_addr+1] = res.y;
}
else if(gidx<cols && gidy<rows)
{
dst[out_addr] = res.x;
}
}
}
#else
// Generic erode/dilate kernel: one output element of type GENTYPE per work-item.
// The work-group stages a (LSIZE0 + 2*RADIUSX + 1)-wide source tile in local
// memory (two elements per work-item), then each work-item reduces its
// (2*RADIUSY+1) x (2*RADIUSX+1) neighbourhood with MORPH_OP.
// Elements outside the whole source image are replaced by VAL.
// When RECTKERNEL is not defined, mat_kernel masks which neighbours take part.
__kernel void morph(__global const GENTYPE * restrict src,
                    __global GENTYPE *dst,
                    int src_offset_x, int src_offset_y,
                    int cols, int rows,
                    int src_step_in_pixel, int dst_step_in_pixel,
                    __constant uchar * mat_kernel,
                    int src_whole_cols, int src_whole_rows,
                    int dst_offset_in_pixel)
{
    __local GENTYPE tile[2*LSIZE1*LSIZE0];

    const int lid_x = get_local_id(0);
    const int lid_y = get_local_id(1);

    // Origin of the work-group in the destination and of its tile in the source.
    const int group_x = get_group_id(0)*LSIZE0;
    const int group_y = get_group_id(1)*LSIZE1;
    const int tile_x0 = group_x+src_offset_x-RADIUSX;
    const int tile_y0 = group_y+src_offset_y-RADIUSY;
    const int tile_x1 = group_x + src_offset_x+LSIZE0+RADIUSX;
    const int tile_w = tile_x1 - tile_x0 + 1;

    // The two tile slots filled by this work-item.
    const int slot_a = mad24(lid_y,LSIZE0,lid_x);
    const int slot_b = slot_a + LSIZE0*LSIZE1;

    // Source coordinates corresponding to both slots.
    const int col_a = tile_x0 + slot_a % tile_w;
    const int row_a = tile_y0 + slot_a / tile_w;
    const int col_b = tile_x0 + slot_b % tile_w;
    const int row_b = tile_y0 + slot_b / tile_w;

    // Linear addresses; anything outside (0, limit) is redirected to element 0
    // so that the load itself never leaves the buffer.
    const int limit = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
    int addr_a = mad24(row_a,src_step_in_pixel,col_a);
    int addr_b = mad24(row_b,src_step_in_pixel,col_b);
    if (!((addr_a < limit) && (addr_a > 0)))
        addr_a = 0;
    if (!((addr_b < limit) && (addr_b > 0)))
        addr_b = 0;

    GENTYPE pix_a = src[addr_a];
    GENTYPE pix_b = src[addr_b];

    // Substitute the border value for elements lying outside the image.
    pix_a = ELEM(col_a,0,src_whole_cols,(GENTYPE)VAL,pix_a);
    pix_a = ELEM(row_a,0,src_whole_rows,(GENTYPE)VAL,pix_a);
    pix_b = ELEM(col_b,0,src_whole_cols,(GENTYPE)VAL,pix_b);
    pix_b = ELEM(row_b,0,src_whole_rows,(GENTYPE)VAL,pix_b);

    tile[slot_a] = pix_a;
    tile[slot_b] = pix_b;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Reduce the neighbourhood of this work-item.
    GENTYPE acc = (GENTYPE)VAL;
    for (int ky = 0; ky < 2*RADIUSY+1; ky++)
    {
        for (int kx = 0; kx < 2*RADIUSX+1; kx++)
        {
#ifndef RECTKERNEL
            if (mat_kernel[ky*(2*RADIUSX+1)+kx])
#endif
                acc = MORPH_OP(acc,tile[mad24(lid_y+ky,tile_w,lid_x+kx)]);
        }
    }

    const int out_x = get_global_id(0);
    const int out_y = get_global_id(1);
    if (out_x < cols && out_y < rows)
        dst[mad24(out_y,dst_step_in_pixel,out_x+dst_offset_in_pixel)] = acc;
}
#endif

File diff suppressed because it is too large Load Diff

@ -0,0 +1,323 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Wu Zailong, bullet@yeah.net
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
// Enable double precision when the build requests it, using whichever fp64
// extension the compiler advertises.
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
// Nearest-neighbour kernels copy pixels unchanged, so convertToWT expands to
// nothing there and convertToWT(x) becomes (x).
#ifdef INTER_NEAREST
#define convertToWT
#endif
// EXTRAPOLATE(v2, v): produce the value v for an out-of-image coordinate v2 (int2)
// according to the border mode selected at build time. Except for
// BORDER_CONSTANT, v2 is modified in place and the pixel is re-read from src.
// The macro relies on names from the calling kernel: src, src_step, src_offset,
// src_cols, src_rows, and -- depending on the mode -- scalar and zero.
#ifdef BORDER_CONSTANT
// Constant border: use the border value directly.
#define EXTRAPOLATE(v2, v) v = scalar;
#elif defined BORDER_REPLICATE
// Replicate border: clamp the coordinate to the nearest valid pixel.
#define EXTRAPOLATE(v2, v) \
{ \
v2 = max(min(v2, (int2)(src_cols - 1, src_rows - 1)), zero); \
v = convertToWT(src[mad24(v2.y, src_step, v2.x + src_offset)]); \
}
#elif defined BORDER_WRAP
// Wrap border: bring the coordinate back into range modulo the image size.
#define EXTRAPOLATE(v2, v) \
{ \
if (v2.x < 0) \
v2.x -= ((v2.x - src_cols + 1) / src_cols) * src_cols; \
if (v2.x >= src_cols) \
v2.x %= src_cols; \
\
if (v2.y < 0) \
v2.y -= ((v2.y - src_rows + 1) / src_rows) * src_rows; \
if( v2.y >= src_rows ) \
v2.y %= src_rows; \
v = convertToWT(src[mad24(v2.y, src_step, v2.x + src_offset)]); \
}
#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)
// Reflect borders: delta is 0 for BORDER_REFLECT and 1 for BORDER_REFLECT_101;
// it is added/subtracted when mirroring the coordinate at an edge.
#ifdef BORDER_REFLECT
#define DELTA int delta = 0
#else
#define DELTA int delta = 1
#endif
// The coordinate is mirrored repeatedly until it falls inside the image;
// a dimension of size 1 always maps to index 0.
#define EXTRAPOLATE(v2, v) \
{ \
DELTA; \
if (src_cols == 1) \
v2.x = 0; \
else \
do \
{ \
if( v2.x < 0 ) \
v2.x = -v2.x - 1 + delta; \
else \
v2.x = src_cols - 1 - (v2.x - src_cols) - delta; \
} \
while (v2.x >= src_cols || v2.x < 0); \
\
if (src_rows == 1) \
v2.y = 0; \
else \
do \
{ \
if( v2.y < 0 ) \
v2.y = -v2.y - 1 + delta; \
else \
v2.y = src_rows - 1 - (v2.y - src_rows) - delta; \
} \
while (v2.y >= src_rows || v2.y < 0); \
v = convertToWT(src[mad24(v2.y, src_step, v2.x + src_offset)]); \
}
#else
#error No extrapolation method
#endif
// True when (gx, gy) lies outside the source image. The arguments are not
// parenthesised in the expansion, so pass simple expressions only.
#define NEED_EXTRAPOLATION(gx, gy) (gx >= src_cols || gy >= src_rows || gx < 0 || gy < 0)
#ifdef INTER_NEAREST
// Nearest-neighbour remap with the x and y coordinates stored in two separate
// float maps. Each work-item writes one destination element: the map values are
// rounded to the nearest integer (saturating) and the source pixel at that
// position is copied; positions outside the source go through EXTRAPOLATE.
// Offsets and steps are used directly as element indices.
__kernel void remap_2_32FC1(__global const T * restrict src, __global T * dst,
                            __global float * map1, __global float * map2,
                            int src_offset, int dst_offset, int map1_offset, int map2_offset,
                            int src_step, int dst_step, int map1_step, int map2_step,
                            int src_cols, int src_rows, int dst_cols, int dst_rows, T scalar)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the destination have nothing to do.
    if (col >= dst_cols || row >= dst_rows)
        return;

    const int out_pos = mad24(row, dst_step, col + dst_offset);
    const int mapx_pos = mad24(row, map1_step, col + map1_offset);
    const int mapy_pos = mad24(row, map2_step, col + map2_offset);

    // Not const: EXTRAPOLATE may rewrite the coordinate in place.
    int2 gxy = (int2)(convert_int_sat_rte(map1[mapx_pos]),
                      convert_int_sat_rte(map2[mapy_pos]));

    if (!NEED_EXTRAPOLATION(gxy.x, gxy.y))
    {
        dst[out_pos] = src[mad24(gxy.y, src_step, gxy.x + src_offset)];
    }
    else
    {
        // 'zero' is referenced by some EXTRAPOLATE variants.
        int2 zero = (int2)(0);
        EXTRAPOLATE(gxy, dst[out_pos]);
    }
}
// Nearest-neighbour remap with both coordinates interleaved in one float2 map.
// Each work-item writes one destination element: the map entry is rounded to the
// nearest integer pair (saturating) and the source pixel at that position is
// copied; positions outside the source go through EXTRAPOLATE.
__kernel void remap_32FC2(__global const T * restrict src, __global T * dst, __global float2 * map1,
                          int src_offset, int dst_offset, int map1_offset,
                          int src_step, int dst_step, int map1_step,
                          int src_cols, int src_rows, int dst_cols, int dst_rows, T scalar)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the destination have nothing to do.
    if (col >= dst_cols || row >= dst_rows)
        return;

    const int out_pos = mad24(row, dst_step, col + dst_offset);
    const int map_pos = mad24(row, map1_step, col + map1_offset);

    // Not const: EXTRAPOLATE may rewrite the coordinate in place.
    int2 gxy = convert_int2_sat_rte(map1[map_pos]);

    if (!NEED_EXTRAPOLATION(gxy.x, gxy.y))
    {
        dst[out_pos] = src[mad24(gxy.y, src_step, gxy.x + src_offset)];
    }
    else
    {
        // 'zero' is referenced by some EXTRAPOLATE variants.
        int2 zero = (int2)(0);
        EXTRAPOLATE(gxy, dst[out_pos]);
    }
}
// Nearest-neighbour remap with integer coordinates stored as short2.
// Each work-item writes one destination element: the map entry is widened to
// int2 and the source pixel at that position is copied; positions outside the
// source go through EXTRAPOLATE.
__kernel void remap_16SC2(__global const T * restrict src, __global T * dst, __global short2 * map1,
                          int src_offset, int dst_offset, int map1_offset,
                          int src_step, int dst_step, int map1_step,
                          int src_cols, int src_rows, int dst_cols, int dst_rows, T scalar)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the destination have nothing to do.
    if (col >= dst_cols || row >= dst_rows)
        return;

    const int out_pos = mad24(row, dst_step, col + dst_offset);
    const int map_pos = mad24(row, map1_step, col + map1_offset);

    // Not const: EXTRAPOLATE may rewrite the coordinate in place.
    int2 gxy = convert_int2(map1[map_pos]);

    if (!NEED_EXTRAPOLATION(gxy.x, gxy.y))
    {
        dst[out_pos] = src[mad24(gxy.y, src_step, gxy.x + src_offset)];
    }
    else
    {
        // 'zero' is referenced by some EXTRAPOLATE variants.
        int2 zero = (int2)(0);
        EXTRAPOLATE(gxy, dst[out_pos]);
    }
}
#elif INTER_LINEAR
// Bilinear remap with the x and y coordinates stored in two separate float maps.
// Each work-item writes one destination element, blending the four source
// pixels around the mapped position. The fractional weights are quantised to
// multiples of 1/32. Neighbours outside the source go through EXTRAPOLATE.
__kernel void remap_2_32FC1(__global T const * restrict src, __global T * dst,
                            __global float * map1, __global float * map2,
                            int src_offset, int dst_offset, int map1_offset, int map2_offset,
                            int src_step, int dst_step, int map1_step, int map2_step,
                            int src_cols, int src_rows, int dst_cols, int dst_rows, T nVal)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the destination have nothing to do.
    if (col >= dst_cols || row >= dst_rows)
        return;

    const float2 coord = (float2)(map1[mad24(row, map1_step, col + map1_offset)],
                                  map2[mad24(row, map2_step, col + map2_offset)]);

    // Integer corners of the interpolation cell (rounded towards -inf).
    // Not const: EXTRAPOLATE may rewrite them in place.
    int2 p00 = convert_int2_sat_rtn(coord);
    int2 p10 = (int2)(p00.x + 1, p00.y);
    int2 p01 = (int2)(p00.x, p00.y + 1);
    int2 p11 = (int2)(p00.x + 1, p00.y +1);
    int2 zero = (int2)(0);   // referenced by some EXTRAPOLATE variants

    // Fractional position inside the cell, computed before p00 can be modified.
    float2 frac = coord - convert_float2(p00);
    WT2 u = convertToWT2(convert_int2_rte(convertToWT2(frac) * (WT2)32)) / (WT2)32;

    WT scalar = convertToWT(nVal);   // referenced by the BORDER_CONSTANT EXTRAPOLATE
    WT a = scalar, b = scalar, c = scalar, d = scalar;

    if (NEED_EXTRAPOLATION(p00.x, p00.y))
    {
        EXTRAPOLATE(p00, a);
    }
    else
        a = convertToWT(src[mad24(p00.y, src_step, p00.x + src_offset)]);

    if (NEED_EXTRAPOLATION(p10.x, p10.y))
    {
        EXTRAPOLATE(p10, b);
    }
    else
        b = convertToWT(src[mad24(p10.y, src_step, p10.x + src_offset)]);

    if (NEED_EXTRAPOLATION(p01.x, p01.y))
    {
        EXTRAPOLATE(p01, c);
    }
    else
        c = convertToWT(src[mad24(p01.y, src_step, p01.x + src_offset)]);

    if (NEED_EXTRAPOLATION(p11.x, p11.y))
    {
        EXTRAPOLATE(p11, d);
    }
    else
        d = convertToWT(src[mad24(p11.y, src_step, p11.x + src_offset)]);

    WT blended = a * (WT)(1 - u.x) * (WT)(1 - u.y) +
                 b * (WT)(u.x) * (WT)(1 - u.y) +
                 c * (WT)(1 - u.x) * (WT)(u.y) +
                 d * (WT)(u.x) * (WT)(u.y);

    dst[mad24(row, dst_step, col + dst_offset)] = convertToT(blended);
}
// Bilinear remap with both coordinates interleaved in one float2 map.
// Each work-item writes one destination element, blending the four source
// pixels around the mapped position. The fractional weights are quantised to
// multiples of 1/32. Neighbours outside the source go through EXTRAPOLATE.
__kernel void remap_32FC2(__global T const * restrict src, __global T * dst,
                          __global float2 * map1,
                          int src_offset, int dst_offset, int map1_offset,
                          int src_step, int dst_step, int map1_step,
                          int src_cols, int src_rows, int dst_cols, int dst_rows, T nVal)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the destination have nothing to do.
    if (col >= dst_cols || row >= dst_rows)
        return;

    const float2 coord = map1[mad24(row, map1_step, col + map1_offset)];

    // Integer corners of the interpolation cell (rounded towards -inf).
    // Not const: EXTRAPOLATE may rewrite them in place.
    int2 p00 = convert_int2_sat_rtn(coord);
    int2 p10 = (int2)(p00.x + 1, p00.y);
    int2 p01 = (int2)(p00.x, p00.y + 1);
    int2 p11 = (int2)(p00.x + 1, p00.y + 1);
    int2 zero = (int2)(0);   // referenced by some EXTRAPOLATE variants

    // Fractional position inside the cell, computed before p00 can be modified.
    float2 frac = coord - convert_float2(p00);
    WT2 u = convertToWT2(convert_int2_rte(convertToWT2(frac) * (WT2)32)) / (WT2)32;

    WT scalar = convertToWT(nVal);   // referenced by the BORDER_CONSTANT EXTRAPOLATE
    WT a = scalar, b = scalar, c = scalar, d = scalar;

    if (NEED_EXTRAPOLATION(p00.x, p00.y))
    {
        EXTRAPOLATE(p00, a);
    }
    else
        a = convertToWT(src[mad24(p00.y, src_step, p00.x + src_offset)]);

    if (NEED_EXTRAPOLATION(p10.x, p10.y))
    {
        EXTRAPOLATE(p10, b);
    }
    else
        b = convertToWT(src[mad24(p10.y, src_step, p10.x + src_offset)]);

    if (NEED_EXTRAPOLATION(p01.x, p01.y))
    {
        EXTRAPOLATE(p01, c);
    }
    else
        c = convertToWT(src[mad24(p01.y, src_step, p01.x + src_offset)]);

    if (NEED_EXTRAPOLATION(p11.x, p11.y))
    {
        EXTRAPOLATE(p11, d);
    }
    else
        d = convertToWT(src[mad24(p11.y, src_step, p11.x + src_offset)]);

    WT blended = a * (WT)(1 - u.x) * (WT)(1 - u.y) +
                 b * (WT)(u.x) * (WT)(1 - u.y) +
                 c * (WT)(1 - u.x) * (WT)(u.y) +
                 d * (WT)(u.x) * (WT)(u.y);

    dst[mad24(row, dst_step, col + dst_offset)] = convertToT(blended);
}
#endif

@ -0,0 +1,152 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Zhang Ying, zhangying913@gmail.com
// Niko Li, newlife20080214@gmail.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
// resize kernel
// Currently, CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 are supported.
// We shall support other types later if necessary.
// F is the floating-point type used for coordinate computation:
// double when DOUBLE_SUPPORT is defined, float otherwise.
#if defined DOUBLE_SUPPORT
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#define F double
#else
#define F float
#endif
// Fixed-point interpolation weights carry 11 fractional bits (scale 2048).
#define INTER_RESIZE_COEF_BITS 11
#define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS)
// A product of two weights carries twice as many fractional bits.
#define CAST_BITS (INTER_RESIZE_COEF_BITS << 1)
#define CAST_SCALE (1.0f/(1<<CAST_BITS))
// Next index after x, clamped to the last valid index l-1.
// NOTE: arguments are not parenthesised; pass simple expressions only.
#define INC(x,l) min(x+1,l-1)
// Size of one pixel in bytes. PIXTYPE is not defined in this file;
// presumably it is supplied as a build option -- TODO confirm.
#define PIXSIZE ((int)sizeof(PIXTYPE))
// Identity conversion.
#define noconvert(x) (x)
#if defined INTER_LINEAR
// Bilinear resize: each work-item computes one destination pixel.
//
//   srcptr/dstptr          byte pointers to the source / destination buffers
//   srcstep/dststep        row strides in bytes
//   srcoffset/dstoffset    byte offsets of the first pixel
//   srcrows/srccols,
//   dstrows/dstcols        image sizes in pixels
//   ifx, ify               inverse scale factors (source step per destination pixel)
//
// PIXTYPE, WORKTYPE, convertToWT, convertToDT and depth are not defined in this
// file; presumably they are supplied as build options -- TODO confirm.
//
// Fixes relative to the previous revision:
//  * the non-8U branch referenced undeclared s_data1..s_data4 and could not
//    compile; it now uses the loaded neighbours data0..data3;
//  * pixel pointers keep the __global address space instead of being cast to
//    unqualified (private) pointers;
//  * work-items outside the destination exit before touching memory;
//  * the four neighbour loads, identical in both branches, are done once.
__kernel void resizeLN(__global const uchar* srcptr, int srcstep, int srcoffset,
                       int srcrows, int srccols,
                       __global uchar* dstptr, int dststep, int dstoffset,
                       int dstrows, int dstcols,
                       float ifx, float ify)
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    // The kernel contains no barrier, so leaving early is safe.
    if (dx >= dstcols || dy >= dstrows)
        return;

    // Map the centre of the destination pixel into source coordinates.
    float sx = ((dx+0.5f) * ifx - 0.5f), sy = ((dy+0.5f) * ify - 0.5f);
    int x = floor(sx), y = floor(sy);
    float u = sx - x, v = sy - y;

    // Clamp to the image; at the border the fractional weight is dropped.
    if ( x<0 ) x=0,u=0;
    if ( x>=srccols ) x=srccols-1,u=0;
    if ( y<0 ) y=0,v=0;
    if ( y>=srcrows ) y=srcrows-1,v=0;

    // Right / bottom neighbours, clamped to the last column / row.
    int y_ = INC(y,srcrows);
    int x_ = INC(x,srccols);

    // The 2x2 neighbourhood: data0 = (x, y), data1 = (x_, y), data2 = (x, y_), data3 = (x_, y_).
    WORKTYPE data0 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE)));
    WORKTYPE data1 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x_*PIXSIZE)));
    WORKTYPE data2 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y_, srcstep, srcoffset + x*PIXSIZE)));
    WORKTYPE data3 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y_, srcstep, srcoffset + x_*PIXSIZE)));

#if depth == 0
    // 8-bit path: fixed-point weights with INTER_RESIZE_COEF_BITS fractional bits.
    u = u * INTER_RESIZE_COEF_SCALE;
    v = v * INTER_RESIZE_COEF_SCALE;
    int U = rint(u);
    int V = rint(v);
    int U1 = rint(INTER_RESIZE_COEF_SCALE - u);
    int V1 = rint(INTER_RESIZE_COEF_SCALE - v);

    WORKTYPE val = mul24((WORKTYPE)mul24(U1, V1), data0) + mul24((WORKTYPE)mul24(U, V1), data1) +
                   mul24((WORKTYPE)mul24(U1, V), data2) + mul24((WORKTYPE)mul24(U, V), data3);

    // Round to nearest and drop the 2*INTER_RESIZE_COEF_BITS fractional bits.
    PIXTYPE uval = convertToDT((val + (1<<(CAST_BITS-1)))>>CAST_BITS);
#else
    // Other depths: plain floating-point blend of the four neighbours.
    float u1 = 1.f-u;
    float v1 = 1.f-v;
    PIXTYPE uval = u1 * v1 * data0 + u * v1 * data1 + u1 * v * data2 + u * v * data3;
#endif

    __global PIXTYPE* dst = (__global PIXTYPE*)(dstptr + mad24(dy, dststep, dstoffset + dx*PIXSIZE));
    dst[0] = uval;
}
#elif defined INTER_NEAREST
// Nearest-neighbour resize: each work-item copies one source pixel to one
// destination pixel.
//
//   srcptr/dstptr          byte pointers to the source / destination buffers
//   srcstep/dststep        row strides in bytes
//   srcoffset/dstoffset    byte offsets of the first pixel
//   srcrows/srccols,
//   dstrows/dstcols        image sizes in pixels
//   ifx, ify               inverse scale factors (source step per destination pixel)
//
// Fix relative to the previous revision: the pixel pointers keep the __global
// address space; casting a __global pointer to an unqualified (private) pointer
// is not a valid address-space conversion in OpenCL C.
__kernel void resizeNN(__global const uchar* srcptr, int srcstep, int srcoffset,
                       int srcrows, int srccols,
                       __global uchar* dstptr, int dststep, int dstoffset,
                       int dstrows, int dstcols,
                       float ifx, float ify)
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if( dx < dstcols && dy < dstrows )
    {
        // Source position, truncated towards zero and clamped to the last row/column.
        F s1 = dx*ifx;
        F s2 = dy*ify;
        int sx = min(convert_int_rtz(s1), srccols-1);
        int sy = min(convert_int_rtz(s2), srcrows-1);

        __global PIXTYPE* dst = (__global PIXTYPE*)(dstptr +
                mad24(dy, dststep, dstoffset + dx*PIXSIZE));
        __global const PIXTYPE* src = (__global const PIXTYPE*)(srcptr +
                mad24(sy, srcstep, srcoffset + sx*PIXSIZE));

        dst[0] = src[0];
    }
}
#endif

@ -0,0 +1,152 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Zhang Ying, zhangying913@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
// threshold type:
// enum { THRESH_BINARY=0, THRESH_BINARY_INV=1, THRESH_TRUNC=2, THRESH_TOZERO=3,
// THRESH_TOZERO_INV=4, THRESH_MASK=7, THRESH_OTSU=8 };
// Threshold for single-channel 8-bit images; each work-item handles 16
// consecutive bytes of one row.
//
//   src, dst                 source / destination buffers
//   src_offset, src_step     offset of the first source byte and source row stride
//   dst_offset, dst_step     offset of the first destination byte and destination row stride
//   dst_rows, dst_cols       size of the region to process
//   thresh, max_val          threshold and the value written by the binary modes
//   thresh_type              0..4, matching the enum listed above the kernel;
//                            any other value copies the source unchanged
__kernel void threshold_C1_D0(__global const uchar * restrict src, __global uchar *dst,
int src_offset, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step,
uchar thresh, uchar max_val, int thresh_type
)
{
int gx = get_global_id(0);
const int gy = get_global_id(1);
// Shift the 16-byte window back by (dst_offset & 15) so that dst_offset + dstart
// is a multiple of 16; the source window is shifted by the same amount.
int offset = (dst_offset & 15);
src_offset -= offset;
// Column of the first byte in this window, relative to the region; may be negative.
int dstart = (gx << 4) - offset;
if(dstart < dst_cols && gy < dst_rows)
{
uchar16 sdata = vload16(gx, src+src_offset+gy*src_step);
uchar16 ddata;
uchar16 zero = 0;
// The vector comparison yields a per-component mask, so each ?: below selects
// component-wise.
switch (thresh_type)
{
case 0:
ddata = ((sdata > thresh) ) ? (uchar16)(max_val) : (uchar16)(0);
break;
case 1:
ddata = ((sdata > thresh)) ? zero : (uchar16)(max_val);
break;
case 2:
ddata = ((sdata > thresh)) ? (uchar16)(thresh) : sdata;
break;
case 3:
ddata = ((sdata > thresh)) ? sdata : zero;
break;
case 4:
ddata = ((sdata > thresh)) ? zero : sdata;
break;
default:
ddata = sdata;
}
// Columns covered by the 16 components of this window.
int16 dpos = (int16)(dstart, dstart+1, dstart+2, dstart+3, dstart+4, dstart+5, dstart+6, dstart+7, dstart+8,
dstart+9, dstart+10, dstart+11, dstart+12, dstart+13, dstart+14, dstart+15);
// Read the current destination contents so that components outside
// [0, dst_cols) are written back unchanged.
uchar16 dVal = *(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart);
int16 con = dpos >= 0 && dpos < dst_cols;
ddata = convert_uchar16(con != 0) ? ddata : dVal;
// NOTE: this condition is already guaranteed by the enclosing if.
if(dstart < dst_cols)
{
*(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart) = ddata;
}
}
}
// Threshold for single-channel float images; each work-item handles 4
// consecutive elements of one row. Offsets and steps are applied to float
// pointers, i.e. they are used as element counts here.
//
//   src, dst                 source / destination buffers
//   src_offset, src_step     offset of the first source element and source row stride
//   dst_offset, dst_step     offset of the first destination element and destination row stride
//   dst_rows, dst_cols       size of the region to process
//   thresh, max_val          threshold and the value written by the binary modes
//   thresh_type              0..4, matching the enum listed above the kernels;
//                            any other value copies the source unchanged
__kernel void threshold_C1_D5(__global const float * restrict src, __global float *dst,
int src_offset, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step,
float thresh, float max_val, int thresh_type
)
{
const int gx = get_global_id(0);
const int gy = get_global_id(1);
// Shift the 4-element window back by (dst_offset & 3) so that dst_offset + dstart
// is a multiple of 4; the source window is shifted by the same amount.
int offset = (dst_offset & 3);
src_offset -= offset;
// Column of the first element in this window, relative to the region; may be negative.
int dstart = (gx << 2) - offset;
if(dstart < dst_cols && gy < dst_rows)
{
float4 sdata = vload4(gx, src+src_offset+gy*src_step);
float4 ddata;
float4 zero = 0;
// The vector comparison yields a per-component mask, so each ?: below selects
// component-wise.
switch (thresh_type)
{
case 0:
ddata = sdata > thresh ? (float4)(max_val) : (float4)(0.f);
break;
case 1:
ddata = sdata > thresh ? zero : (float4)max_val;
break;
case 2:
ddata = sdata > thresh ? (float4)thresh : sdata;
break;
case 3:
ddata = sdata > thresh ? sdata : (float4)(0.f);
break;
case 4:
ddata = sdata > thresh ? (float4)(0.f) : sdata;
break;
default:
ddata = sdata;
}
// Columns covered by the 4 components of this window.
int4 dpos = (int4)(dstart, dstart+1, dstart+2, dstart+3);
// Read the current destination contents so that components outside
// [0, dst_cols) are written back unchanged.
float4 dVal = *(__global float4*)(dst+dst_offset+gy*dst_step+dstart);
int4 con = dpos >= 0 && dpos < dst_cols;
ddata = convert_float4(con) != (float4)(0) ? ddata : dVal;
// NOTE: this condition is already guaranteed by the enclosing if.
if(dstart < dst_cols)
{
*(__global float4*)(dst+dst_offset+gy*dst_step+dstart) = ddata;
}
}
}

@ -0,0 +1,761 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Zhang Ying, zhangying913@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
//warpAffine kernel
//support data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic.
// F / F4 are the scalar and 4-component floating-point types used for the
// transform arithmetic: double when DOUBLE_SUPPORT is defined (with the
// matching fp64 extension enabled), float otherwise.
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
typedef double F;
typedef double4 F4;
#define convert_F4 convert_double4
#else
typedef float F;
typedef float4 F4;
#define convert_F4 convert_float4
#endif
// Sub-pixel positions are quantised to INTER_TAB_SIZE (= 32) steps per pixel.
#define INTER_BITS 5
#define INTER_TAB_SIZE (1 << INTER_BITS)
// NOTE: INTER_SCALE is not parenthesised; wrap it when using it inside a larger expression.
#define INTER_SCALE 1.f/INTER_TAB_SIZE
// Number of fractional bits of the fixed-point source coordinates (at least 10).
#define AB_BITS max(10, (int)INTER_BITS)
#define AB_SCALE (1 << AB_BITS)
// Fixed-point scale of the interpolation coefficients (15 fractional bits).
#define INTER_REMAP_COEF_BITS 15
#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)
// Computes the four cubic-convolution weights (kernel parameter A = -0.75) for
// a fractional offset x and stores them in coeffs[0..3]. The last weight is
// derived from the other three so that all four sum to one.
inline void interpolateCubic( float x, float* coeffs )
{
    const float A = -0.75f;
    const float x_plus = x + 1.f;    // distance to the first tap
    const float x_comp = 1.f - x;    // distance to the third tap

    const float w0 = ((A*x_plus - 5.0f*A)*x_plus + 8.0f*A)*x_plus - 4.0f*A;
    const float w1 = ((A + 2.f)*x - (A + 3.f))*x*x + 1.f;
    const float w2 = ((A + 2.f)*x_comp - (A + 3.f))*x_comp*x_comp + 1.f;

    coeffs[0] = w0;
    coeffs[1] = w1;
    coeffs[2] = w2;
    coeffs[3] = 1.f - w0 - w1 - w2;
}
/**********************************************8UC1*********************************************
***********************************************************************************************/
// Nearest-neighbour affine warp for single-channel 8-bit images.
// Each work-item produces four consecutive destination pixels of one row.
//
//   src, dst                 source / destination buffers
//   src_cols/rows,
//   dst_cols/rows            image sizes in pixels
//   srcStep, dstStep         row strides
//   src_offset, dst_offset   offsets of the first pixel
//   M                        six transform coefficients, used as
//                            sx = M[0]*dx + M[1]*dy + M[2], sy = M[3]*dx + M[4]*dy + M[5]
//   threadCols               number of work-items in x that have work to do
__kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
// First destination column of this work-item, shifted back by (dst_offset & 3)
// so that dst_offset + dx is a multiple of 4; may be negative.
dx = (dx<<2) - (dst_offset&3);
// Half of the fixed-point unit: added before the shift to round to nearest.
int round_delta = (AB_SCALE>>1);
int4 X, Y;
int4 sx, sy;
// Destination columns, pre-scaled to fixed point with AB_BITS fractional bits.
int4 DX = (int4)(dx, dx+1, dx+2, dx+3);
DX = (DX << AB_BITS);
F4 M0DX, M3DX;
M0DX = M[0] * convert_F4(DX);
M3DX = M[3] * convert_F4(DX);
X = convert_int4(rint(M0DX));
Y = convert_int4(rint(M3DX));
// Row-dependent part of the transform, shared by all four pixels.
int tmp1, tmp2;
tmp1 = rint((M[1]*dy + M[2]) * AB_SCALE);
tmp2 = rint((M[4]*dy + M[5]) * AB_SCALE);
X += tmp1 + round_delta;
Y += tmp2 + round_delta;
// Drop the fractional bits; the result passes through short before being
// widened back to int.
sx = convert_int4(convert_short4(X >> AB_BITS));
sy = convert_int4(convert_short4(Y >> AB_BITS));
// Read the current destination so that pixels outside the region keep their value.
__global uchar4 * d = (__global uchar4 *)(dst+dst_offset+dy*dstStep+dx);
uchar4 dval = *d;
DX = (int4)(dx, dx+1, dx+2, dx+3);
// dcon: destination pixel lies inside the region; scon: source pixel lies inside the image.
int4 dcon = DX >= 0 && DX < dst_cols && dy >= 0 && dy < dst_rows;
int4 scon = sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows;
int4 spos = src_offset + sy * srcStep + sx;
// Source positions outside the image contribute 0.
uchar4 sval;
sval.s0 = scon.s0 ? src[spos.s0] : 0;
sval.s1 = scon.s1 ? src[spos.s1] : 0;
sval.s2 = scon.s2 ? src[spos.s2] : 0;
sval.s3 = scon.s3 ? src[spos.s3] : 0;
// Component-wise select: take the warped value only where dcon is set.
dval = convert_uchar4(dcon) != (uchar4)(0,0,0,0) ? sval : dval;
*d = dval;
}
}
// Bilinear affine warp for single-channel 8-bit pixels (uchar).
// Each active work-item handles four horizontally adjacent destination pixels of
// row get_global_id(1) and stores them with one uchar4 read-modify-write.
//   src, dst               - source / destination pixel buffers
//   src_cols, src_rows     - source size, used for the source bounds tests
//   dst_cols, dst_rows     - destination size, used for the destination bounds test
//   srcStep, dstStep       - row pitch added per row to the uchar index
//   src_offset, dst_offset - index of the first pixel inside each buffer
//   M                      - coefficients M[0..5]: source x = M[0]*dx + M[1]*dy + M[2],
//                            source y = M[3]*dx + M[4]*dy + M[5]
//   threadCols             - number of work-items along x that do any work
__kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
// First of the four destination columns handled here, moved left by the low two
// bits of dst_offset (presumably for 4-byte alignment of the uchar4 access -
// TODO confirm against the host-side launch code).
dx = (dx<<2) - (dst_offset&3);
// Half of one 1/INTER_TAB_SIZE step, expressed in AB_BITS fixed point.
int round_delta = ((AB_SCALE >> INTER_BITS) >> 1);
int4 X, Y;
short4 ax, ay;
int4 sx, sy;
int4 DX = (int4)(dx, dx+1, dx+2, dx+3);
// Destination columns in fixed point with AB_BITS fractional bits.
DX = (DX << AB_BITS);
F4 M0DX, M3DX;
M0DX = M[0] * convert_F4(DX);
M3DX = M[3] * convert_F4(DX);
X = convert_int4(rint(M0DX));
Y = convert_int4(rint(M3DX));
// Row-dependent part of the transform; identical for all four lanes.
int tmp1, tmp2;
tmp1 = rint((M[1]*dy + M[2]) * AB_SCALE);
tmp2 = rint((M[4]*dy + M[5]) * AB_SCALE);
X += tmp1 + round_delta;
Y += tmp2 + round_delta;
// Reduce the precision to INTER_BITS fractional bits.
X = X >> (AB_BITS - INTER_BITS);
Y = Y >> (AB_BITS - INTER_BITS);
// Integer part: top-left source pixel of the 2x2 neighbourhood.
sx = convert_int4(convert_short4(X >> INTER_BITS));
sy = convert_int4(convert_short4(Y >> INTER_BITS));
// Fractional part, in units of 1/INTER_TAB_SIZE.
ax = convert_short4(X & (INTER_TAB_SIZE-1));
ay = convert_short4(Y & (INTER_TAB_SIZE-1));
// v0..v3: top-left, top-right, bottom-left, bottom-right samples for each lane.
uchar4 v0, v1, v2,v3;
int4 scon0, scon1, scon2, scon3;
int4 spos0, spos1, spos2, spos3;
// Per-lane "inside the source image" masks for the four neighbours.
scon0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows);
scon1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows);
scon2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows);
scon3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows);
spos0 = src_offset + sy * srcStep + sx;
spos1 = src_offset + sy * srcStep + sx + 1;
spos2 = src_offset + (sy+1) * srcStep + sx;
spos3 = src_offset + (sy+1) * srcStep + sx + 1;
// Gather the samples lane by lane; neighbours outside the source read as 0.
v0.s0 = scon0.s0 ? src[spos0.s0] : 0;
v1.s0 = scon1.s0 ? src[spos1.s0] : 0;
v2.s0 = scon2.s0 ? src[spos2.s0] : 0;
v3.s0 = scon3.s0 ? src[spos3.s0] : 0;
v0.s1 = scon0.s1 ? src[spos0.s1] : 0;
v1.s1 = scon1.s1 ? src[spos1.s1] : 0;
v2.s1 = scon2.s1 ? src[spos2.s1] : 0;
v3.s1 = scon3.s1 ? src[spos3.s1] : 0;
v0.s2 = scon0.s2 ? src[spos0.s2] : 0;
v1.s2 = scon1.s2 ? src[spos1.s2] : 0;
v2.s2 = scon2.s2 ? src[spos2.s2] : 0;
v3.s2 = scon3.s2 ? src[spos3.s2] : 0;
v0.s3 = scon0.s3 ? src[spos0.s3] : 0;
v1.s3 = scon1.s3 ? src[spos1.s3] : 0;
v2.s3 = scon2.s3 ? src[spos2.s3] : 0;
v3.s3 = scon3.s3 ? src[spos3.s3] : 0;
// Bilinear weights converted to fixed point (scaled by INTER_REMAP_COEF_SCALE,
// saturated to short).
short4 itab0, itab1, itab2, itab3;
float4 taby, tabx;
taby = INTER_SCALE * convert_float4(ay);
tabx = INTER_SCALE * convert_float4(ax);
itab0 = convert_short4_sat(( (1.0f-taby)*(1.0f-tabx) * (float4)INTER_REMAP_COEF_SCALE ));
itab1 = convert_short4_sat(( (1.0f-taby)*tabx * (float4)INTER_REMAP_COEF_SCALE ));
itab2 = convert_short4_sat(( taby*(1.0f-tabx) * (float4)INTER_REMAP_COEF_SCALE ));
itab3 = convert_short4_sat(( taby*tabx * (float4)INTER_REMAP_COEF_SCALE ));
int4 val;
uchar4 tval;
// Weighted sum in integer arithmetic.
val = convert_int4(v0) * convert_int4(itab0) + convert_int4(v1) * convert_int4(itab1)
+ convert_int4(v2) * convert_int4(itab2) + convert_int4(v3) * convert_int4(itab3);
// Add half of the weight scale, shift the scale out and saturate to 8 bits.
tval = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
__global uchar4 * d =(__global uchar4 *)(dst+dst_offset+dy*dstStep+dx);
// Current destination contents: lanes outside the image are written back unchanged.
uchar4 dval = *d;
DX = (int4)(dx, dx+1, dx+2, dx+3);
int4 dcon = DX >= 0 && DX < dst_cols && dy >= 0 && dy < dst_rows;
// Per-lane select: take the interpolated value only where the destination lane is valid.
dval = convert_uchar4(dcon != 0) ? tval : dval;
*d = dval;
}
}
// Bicubic affine warp for single-channel 8-bit pixels (uchar).
// One work-item computes one destination pixel (dx, dy) from a 4x4 source
// neighbourhood using fixed-point weights.
//   src, dst               - source / destination pixel buffers
//   src_cols, src_rows     - source size, used for the source bounds test
//   dst_cols, dst_rows     - destination size, used for the destination bounds test
//   srcStep, dstStep       - row pitch added per row to the uchar index
//   src_offset, dst_offset - index of the first pixel inside each buffer
//   M                      - coefficients M[0..5]: source x = M[0]*dx + M[1]*dy + M[2],
//                            source y = M[3]*dx + M[4]*dy + M[5]
//   threadCols             - number of work-items along x that do any work
__kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
// Half of one 1/INTER_TAB_SIZE step, expressed in AB_BITS fixed point.
int round_delta = ((AB_SCALE>>INTER_BITS)>>1);
// Source coordinates in fixed point with AB_BITS fractional bits.
int X0 = rint(M[0] * dx * AB_SCALE);
int Y0 = rint(M[3] * dx * AB_SCALE);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
// Reduce the precision to INTER_BITS fractional bits.
int X = X0 >> (AB_BITS - INTER_BITS);
int Y = Y0 >> (AB_BITS - INTER_BITS);
// Top-left corner of the 4x4 neighbourhood: one pixel up and left of the integer position.
short sx = (short)(X >> INTER_BITS) - 1;
short sy = (short)(Y >> INTER_BITS) - 1;
// Fractional position, in units of 1/INTER_TAB_SIZE.
short ay = (short)(Y & (INTER_TAB_SIZE-1));
short ax = (short)(X & (INTER_TAB_SIZE-1));
// 4x4 source samples in row-major order; positions outside the source read as 0.
uchar v[16];
int i, j;
#pragma unroll 4
for(i=0; i<4; i++)
for(j=0; j<4; j++)
{
v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : 0;
}
short itab[16];
float tab1y[4], tab1x[4];
float axx, ayy;
ayy = 1.f/INTER_TAB_SIZE * ay;
axx = 1.f/INTER_TAB_SIZE * ax;
// Four cubic coefficients per axis.
interpolateCubic(ayy, tab1y);
interpolateCubic(axx, tab1x);
// Separable 2-D weights converted to fixed point (scaled by INTER_REMAP_COEF_SCALE);
// isum accumulates their total.
int isum = 0;
#pragma unroll 16
for( i=0; i<16; i++ )
{
// Note: this local v shadows the sample array v[] inside the loop body only.
F v = tab1y[(i>>2)] * tab1x[(i&3)];
isum += itab[i] = convert_short_sat( rint( v * INTER_REMAP_COEF_SCALE ) );
}
// Rounding can make the weights sum to something other than INTER_REMAP_COEF_SCALE.
// The difference is folded into one weight chosen among rows/columns 2..3:
// the largest one when the sum is too small, the smallest one when it is too large.
if( isum != INTER_REMAP_COEF_SCALE )
{
int k1, k2;
int diff = isum - INTER_REMAP_COEF_SCALE;
int Mk1=2, Mk2=2, mk1=2, mk2=2;
for( k1 = 2; k1 < 4; k1++ )
for( k2 = 2; k2 < 4; k2++ )
{
if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
mk1 = k1, mk2 = k2;
else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
Mk1 = k1, Mk2 = k2;
}
diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff));
}
if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
{
// Weighted sum in integer arithmetic.
int sum=0;
for ( i =0; i<16; i++ )
{
sum += v[i] * itab[i] ;
}
// Add half of the weight scale, shift the scale out and saturate to 8 bits.
dst[dst_offset+dy*dstStep+dx] = convert_uchar_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
}
}
}
/**********************************************8UC4*********************************************
***********************************************************************************************/
// Nearest-neighbour affine warp for four-channel 8-bit pixels (uchar4).
// One work-item writes one destination pixel (dx, dy). Source coordinates are
// M[0]*dx + M[1]*dy + M[2] and M[3]*dx + M[4]*dy + M[5], evaluated in fixed point
// with AB_BITS fractional bits. Source positions outside the image produce a zero pixel.
// Offsets and steps are shifted right by two to index whole uchar4 elements.
__kernel void warpAffineNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
                                 int dst_cols, int dst_rows, int srcStep, int dstStep,
                                 int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    const int dx = get_global_id(0);
    const int dy = get_global_id(1);

    // Work-items beyond the active range have nothing to do.
    if (dx >= threadCols || dy >= dst_rows)
        return;

    // Half of one fixed-point unit, so the shift below rounds to nearest.
    const int half_unit = (AB_SCALE >> 1);

    int fx = rint(M[0] * dx * AB_SCALE);
    int fy = rint(M[3] * dx * AB_SCALE);
    fx += rint((M[1]*dy + M[2]) * AB_SCALE) + half_unit;
    fy += rint((M[4]*dy + M[5]) * AB_SCALE) + half_unit;

    // Integer source position (narrowed through short, as the coordinates are 16-bit).
    const int col = (short)(fx >> AB_BITS);
    const int row = (short)(fy >> AB_BITS);

    if (dx < 0 || dx >= dst_cols || dy < 0 || dy >= dst_rows)
        return;

    uchar4 pixel = (uchar4)0;
    if (col >= 0 && col < src_cols && row >= 0 && row < src_rows)
        pixel = src[(src_offset>>2) + row*(srcStep>>2) + col];

    dst[(dst_offset>>2) + dy*(dstStep>>2) + dx] = pixel;
}
// Bilinear affine warp for four-channel 8-bit pixels (uchar4).
// One work-item writes one destination pixel (dx, dy). The source position is
// computed in fixed point, split into an integer pixel and a fraction in units of
// 1/INTER_TAB_SIZE, and the 2x2 neighbourhood is blended with integer weights
// scaled by INTER_REMAP_COEF_SCALE. Neighbours outside the source count as zero.
__kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
                                     int dst_cols, int dst_rows, int srcStep, int dstStep,
                                     int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    const int dx = get_global_id(0);
    const int dy = get_global_id(1);
    if (dx >= threadCols || dy >= dst_rows)
        return;

    // Half of one 1/INTER_TAB_SIZE step in AB_BITS fixed point.
    const int half_step = AB_SCALE/INTER_TAB_SIZE/2;
    // Source offset and pitch expressed in uchar4 elements.
    const int srcBase  = (src_offset>>2);
    const int srcPitch = (srcStep>>2);

    const int dxFixed = (dx << AB_BITS);
    int fx = rint(M[0] * dxFixed);
    int fy = rint(M[3] * dxFixed);
    fx += rint((M[1]*dy + M[2]) * AB_SCALE) + half_step;
    fy += rint((M[4]*dy + M[5]) * AB_SCALE) + half_step;
    fx = fx >> (AB_BITS - INTER_BITS);
    fy = fy >> (AB_BITS - INTER_BITS);

    // Integer pixel and fractional position.
    const short col   = (short)(fx >> INTER_BITS);
    const short row   = (short)(fy >> INTER_BITS);
    const short fracX = (short)(fx & (INTER_TAB_SIZE-1));
    const short fracY = (short)(fy & (INTER_TAB_SIZE-1));

    // Bounds tests for the two columns and two rows of the neighbourhood.
    const int colOk0 = (col   >= 0 && col   < src_cols);
    const int colOk1 = (col+1 >= 0 && col+1 < src_cols);
    const int rowOk0 = (row   >= 0 && row   < src_rows);
    const int rowOk1 = (row+1 >= 0 && row+1 < src_rows);

    int4 p00 = (int4)0, p01 = (int4)0, p10 = (int4)0, p11 = (int4)0;
    if (colOk0 && rowOk0)
        p00 = convert_int4(src[srcBase + row * srcPitch + col]);
    if (colOk1 && rowOk0)
        p01 = convert_int4(src[srcBase + row * srcPitch + col+1]);
    if (colOk0 && rowOk1)
        p10 = convert_int4(src[srcBase + (row+1) * srcPitch + col]);
    if (colOk1 && rowOk1)
        p11 = convert_int4(src[srcBase + (row+1) * srcPitch + col+1]);

    // Fixed-point bilinear weights.
    const float wy = 1.f/INTER_TAB_SIZE*fracY;
    const float wx = 1.f/INTER_TAB_SIZE*fracX;
    const int w00 = convert_short_sat(rint( (1.0f-wy)*(1.0f-wx) * INTER_REMAP_COEF_SCALE ));
    const int w01 = convert_short_sat(rint( (1.0f-wy)*wx * INTER_REMAP_COEF_SCALE ));
    const int w10 = convert_short_sat(rint( wy*(1.0f-wx) * INTER_REMAP_COEF_SCALE ));
    const int w11 = convert_short_sat(rint( wy*wx * INTER_REMAP_COEF_SCALE ));

    const int4 acc = p00 * w00 + p01 * w01 + p10 * w10 + p11 * w11;

    // Round, remove the weight scale and saturate to 8 bits per channel.
    if (dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
        dst[(dst_offset>>2) + dy*(dstStep>>2) + dx] =
            convert_uchar4_sat( (acc + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS );
}
// Bicubic affine warp for four-channel 8-bit pixels (uchar4).
// One work-item computes one destination pixel (dx, dy) from a 4x4 source
// neighbourhood using fixed-point weights shared by all four channels.
//   src, dst               - source / destination pixel buffers (uchar4 elements)
//   src_cols, src_rows     - source size, used for the source bounds test
//   dst_cols, dst_rows     - destination size, used for the destination bounds test
//   srcStep, dstStep       - row pitch; shifted right by two below to index uchar4 elements
//   src_offset, dst_offset - first-pixel offset; shifted right by two below likewise
//   M                      - coefficients M[0..5]: source x = M[0]*dx + M[1]*dy + M[2],
//                            source y = M[3]*dx + M[4]*dy + M[5]
//   threadCols             - number of work-items along x that do any work
__kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
// Half of one 1/INTER_TAB_SIZE step, expressed in AB_BITS fixed point.
int round_delta = ((AB_SCALE>>INTER_BITS)>>1);
// Convert offsets and pitches to uchar4 element units.
src_offset = (src_offset>>2);
srcStep = (srcStep>>2);
dst_offset = (dst_offset>>2);
dstStep = (dstStep>>2);
// Source coordinates in fixed point with AB_BITS fractional bits.
int tmp = (dx << AB_BITS);
int X0 = rint(M[0] * tmp);
int Y0 = rint(M[3] * tmp);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
// Reduce the precision to INTER_BITS fractional bits.
X0 = X0 >> (AB_BITS - INTER_BITS);
Y0 = Y0 >> (AB_BITS - INTER_BITS);
// Top-left corner of the 4x4 neighbourhood: one pixel up and left of the integer position.
int sx = (short)(X0 >> INTER_BITS) - 1;
int sy = (short)(Y0 >> INTER_BITS) - 1;
// Fractional position, in units of 1/INTER_TAB_SIZE.
int ay = (short)(Y0 & (INTER_TAB_SIZE-1));
int ax = (short)(X0 & (INTER_TAB_SIZE-1));
// 4x4 source samples in row-major order; positions outside the source read as zero pixels.
uchar4 v[16];
int i,j;
#pragma unroll 4
for(i=0; i<4; i++)
for(j=0; j<4; j++)
{
v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0;
}
int itab[16];
float tab1y[4], tab1x[4];
float axx, ayy;
ayy = INTER_SCALE * ay;
axx = INTER_SCALE * ax;
// Four cubic coefficients per axis.
interpolateCubic(ayy, tab1y);
interpolateCubic(axx, tab1x);
// Separable 2-D weights converted to fixed point (scaled by INTER_REMAP_COEF_SCALE);
// isum accumulates their total.
int isum = 0;
#pragma unroll 16
for( i=0; i<16; i++ )
{
float tmp;
tmp = tab1y[(i>>2)] * tab1x[(i&3)] * INTER_REMAP_COEF_SCALE;
itab[i] = rint(tmp);
isum += itab[i];
}
// Rounding can make the weights sum to something other than INTER_REMAP_COEF_SCALE.
// The difference is folded into one weight chosen among rows/columns 2..3:
// the largest one when the sum is too small, the smallest one when it is too large.
if( isum != INTER_REMAP_COEF_SCALE )
{
int k1, k2;
int diff = isum - INTER_REMAP_COEF_SCALE;
int Mk1=2, Mk2=2, mk1=2, mk2=2;
for( k1 = 2; k1 < 4; k1++ )
for( k2 = 2; k2 < 4; k2++ )
{
if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
mk1 = k1, mk2 = k2;
else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
Mk1 = k1, Mk2 = k2;
}
diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff));
}
if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
{
// Per-channel weighted sum in integer arithmetic.
int4 sum=0;
for ( i =0; i<16; i++ )
{
sum += convert_int4(v[i]) * itab[i];
}
// Add half of the weight scale, shift the scale out and saturate to 8 bits per channel.
dst[dst_offset+dy*dstStep+dx] = convert_uchar4_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
}
}
}
/**********************************************32FC1********************************************
***********************************************************************************************/
// Nearest-neighbour affine warp for single-channel float pixels.
// One work-item writes one destination pixel (dx, dy). Source coordinates are
// M[0]*dx + M[1]*dy + M[2] and M[3]*dx + M[4]*dy + M[5], evaluated in fixed point
// with AB_BITS fractional bits. Source positions outside the image produce 0.
// The offsets are shifted right by two to index float elements; the steps are used as given.
__kernel void warpAffineNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
                                 int dst_cols, int dst_rows, int srcStep, int dstStep,
                                 int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    const int dx = get_global_id(0);
    const int dy = get_global_id(1);

    // Work-items beyond the active range have nothing to do.
    if (dx >= threadCols || dy >= dst_rows)
        return;

    // Half of one fixed-point unit, so the shift below rounds to nearest.
    const int half_unit = AB_SCALE/2;

    int fx = rint(M[0] * dx * AB_SCALE);
    int fy = rint(M[3] * dx * AB_SCALE);
    fx += rint((M[1]*dy + M[2]) * AB_SCALE) + half_unit;
    fy += rint((M[4]*dy + M[5]) * AB_SCALE) + half_unit;

    const short col = (short)(fx >> AB_BITS);
    const short row = (short)(fy >> AB_BITS);

    if (dx < 0 || dx >= dst_cols || dy < 0 || dy >= dst_rows)
        return;

    float pixel = 0;
    if (col >= 0 && col < src_cols && row >= 0 && row < src_rows)
        pixel = src[(src_offset>>2) + row*srcStep + col];

    dst[(dst_offset>>2) + dy*dstStep + dx] = pixel;
}
// Bilinear affine warp for single-channel float pixels.
// One work-item writes one destination pixel (dx, dy). The source position is
// computed in fixed point, split into an integer pixel and a fraction in units of
// 1/INTER_TAB_SIZE, and the 2x2 neighbourhood is blended with float weights.
// Neighbours outside the source count as 0.
//   src, dst               - source / destination float buffers
//   src_cols, src_rows     - source size, used for the source bounds tests
//   dst_cols, dst_rows     - destination size, used for the destination bounds test
//   srcStep, dstStep       - row pitch, used directly as a float element count
//   src_offset, dst_offset - first-pixel offset; shifted right by two to index floats
//   M                      - coefficients M[0..5] of the affine map
//   threadCols             - number of work-items along x that do any work
__kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
                                     int dst_cols, int dst_rows, int srcStep, int dstStep,
                                     int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);
    if( dx < threadCols && dy < dst_rows)
    {
        // Half of one 1/INTER_TAB_SIZE step in AB_BITS fixed point.
        int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
        src_offset = (src_offset>>2);

        int X0 = rint(M[0] * dx * AB_SCALE);
        int Y0 = rint(M[3] * dx * AB_SCALE);
        X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
        Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
        // Reduce the precision to INTER_BITS fractional bits.
        X0 = X0 >> (AB_BITS - INTER_BITS);
        Y0 = Y0 >> (AB_BITS - INTER_BITS);

        // Integer pixel and fractional position.
        short sx0 = (short)(X0 >> INTER_BITS);
        short sy0 = (short)(Y0 >> INTER_BITS);
        short ax0 = (short)(X0 & (INTER_TAB_SIZE-1));
        short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1));

        // Top-left, top-right, bottom-left, bottom-right samples.
        float v0, v1, v2, v3;
        v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0;
        v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : 0;
        v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : 0;
        v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : 0;

        // Per-axis weights. Single-precision literals keep this arithmetic in float;
        // the values are exact multiples of 1/INTER_TAB_SIZE, so the results are the
        // same as with a double-precision 1.0.
        float tab[4];
        float taby[2], tabx[2];
        taby[0] = 1.0f - 1.f/INTER_TAB_SIZE*ay0;
        taby[1] = 1.f/INTER_TAB_SIZE*ay0;
        tabx[0] = 1.0f - 1.f/INTER_TAB_SIZE*ax0;
        tabx[1] = 1.f/INTER_TAB_SIZE*ax0;

        tab[0] = taby[0] * tabx[0];
        tab[1] = taby[0] * tabx[1];
        tab[2] = taby[1] * tabx[0];
        tab[3] = taby[1] * tabx[1];

        float sum = 0;
        sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];

        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
            dst[(dst_offset>>2)+dy*dstStep+dx] = sum;
    }
}
// Bicubic affine warp for single-channel float pixels.
// One work-item computes one destination pixel (dx, dy) as the weighted sum of a
// 4x4 source neighbourhood. The weights are products of the per-axis coefficients
// returned by interpolateCubic. Samples outside the source count as 0.
__kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
                                    int dst_cols, int dst_rows, int srcStep, int dstStep,
                                    int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    const int dx = get_global_id(0);
    const int dy = get_global_id(1);
    if (dx >= threadCols || dy >= dst_rows)
        return;

    // Half of one 1/INTER_TAB_SIZE step in AB_BITS fixed point.
    const int half_step = AB_SCALE/INTER_TAB_SIZE/2;
    // Offsets expressed in float elements.
    const int srcBase = (src_offset>>2);
    const int dstBase = (dst_offset>>2);

    int fx = rint(M[0] * dx * AB_SCALE);
    int fy = rint(M[3] * dx * AB_SCALE);
    fx += rint((M[1]*dy + M[2]) * AB_SCALE) + half_step;
    fy += rint((M[4]*dy + M[5]) * AB_SCALE) + half_step;
    fx = fx >> (AB_BITS - INTER_BITS);
    fy = fy >> (AB_BITS - INTER_BITS);

    // Top-left corner of the 4x4 window and the fractional position inside it.
    const short col0  = (short)(fx >> INTER_BITS) - 1;
    const short row0  = (short)(fy >> INTER_BITS) - 1;
    const short fracY = (short)(fy & (INTER_TAB_SIZE-1));
    const short fracX = (short)(fx & (INTER_TAB_SIZE-1));

    // Gather the window in row-major order.
    float taps[16];
    for (int r = 0; r < 4; r++)
    {
        for (int c = 0; c < 4; c++)
        {
            const int inside = (col0+c >= 0 && col0+c < src_cols && row0+r >= 0 && row0+r < src_rows);
            taps[r*4 + c] = inside ? src[srcBase + (row0+r) * srcStep + (col0+c)] : 0;
        }
    }

    // Per-axis cubic coefficients, then the separable 2-D weights.
    float wy[4], wx[4];
    const float ty = 1.f/INTER_TAB_SIZE * fracY;
    const float tx = 1.f/INTER_TAB_SIZE * fracX;
    interpolateCubic(ty, wy);
    interpolateCubic(tx, wx);

    float weights[16];
    int k;
    #pragma unroll 4
    for (k = 0; k < 16; k++)
    {
        weights[k] = wy[(k>>2)] * wx[(k&3)];
    }

    if (dx < 0 || dx >= dst_cols || dy < 0 || dy >= dst_rows)
        return;

    float acc = 0;
    #pragma unroll 4
    for (k = 0; k < 16; k++)
    {
        acc += taps[k] * weights[k];
    }
    dst[dstBase + dy*dstStep + dx] = acc;
}
/**********************************************32FC4********************************************
***********************************************************************************************/
// Nearest-neighbour affine warp for four-channel float pixels (float4).
// One work-item writes one destination pixel (dx, dy). Source coordinates are
// M[0]*dx + M[1]*dy + M[2] and M[3]*dx + M[4]*dy + M[5], evaluated in fixed point
// with AB_BITS fractional bits. Source positions outside the image produce a zero pixel.
// Offsets are shifted right by four and steps by two to index float4 elements.
__kernel void warpAffineNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
                                 int dst_cols, int dst_rows, int srcStep, int dstStep,
                                 int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    const int dx = get_global_id(0);
    const int dy = get_global_id(1);

    // Work-items beyond the active range have nothing to do.
    if (dx >= threadCols || dy >= dst_rows)
        return;

    // Half of one fixed-point unit, so the shift below rounds to nearest.
    const int half_unit = AB_SCALE/2;

    int fx = rint(M[0] * dx * AB_SCALE);
    int fy = rint(M[3] * dx * AB_SCALE);
    fx += rint((M[1]*dy + M[2]) * AB_SCALE) + half_unit;
    fy += rint((M[4]*dy + M[5]) * AB_SCALE) + half_unit;

    const short col = (short)(fx >> AB_BITS);
    const short row = (short)(fy >> AB_BITS);

    if (dx < 0 || dx >= dst_cols || dy < 0 || dy >= dst_rows)
        return;

    float4 pixel = (float4)0;
    if (col >= 0 && col < src_cols && row >= 0 && row < src_rows)
        pixel = src[(src_offset>>4) + row*(srcStep>>2) + col];

    dst[(dst_offset>>4) + dy*(dstStep>>2) + dx] = pixel;
}
// Bilinear affine warp for four-channel float pixels (float4).
// One work-item writes one destination pixel (dx, dy). The source position is
// computed in fixed point, split into an integer pixel and a fraction in units of
// 1/INTER_TAB_SIZE, and the 2x2 neighbourhood is blended with float weights.
// Neighbours outside the source count as zero pixels.
//   src, dst               - source / destination float4 buffers
//   src_cols, src_rows     - source size, used for the source bounds tests
//   dst_cols, dst_rows     - destination size, used for the destination bounds test
//   srcStep, dstStep       - row pitch; shifted right by two to index float4 elements
//   src_offset, dst_offset - first-pixel offset; shifted right by four to index float4 elements
//   M                      - coefficients M[0..5] of the affine map
//   threadCols             - number of work-items along x that do any work
__kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
                                     int dst_cols, int dst_rows, int srcStep, int dstStep,
                                     int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);
    if( dx < threadCols && dy < dst_rows)
    {
        // Half of one 1/INTER_TAB_SIZE step in AB_BITS fixed point.
        int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
        // Convert offsets and pitches to float4 element units.
        src_offset = (src_offset>>4);
        dst_offset = (dst_offset>>4);
        srcStep = (srcStep>>2);
        dstStep = (dstStep>>2);

        int X0 = rint(M[0] * dx * AB_SCALE);
        int Y0 = rint(M[3] * dx * AB_SCALE);
        X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
        Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
        // Reduce the precision to INTER_BITS fractional bits.
        X0 = X0 >> (AB_BITS - INTER_BITS);
        Y0 = Y0 >> (AB_BITS - INTER_BITS);

        // Integer pixel and fractional position.
        short sx0 = (short)(X0 >> INTER_BITS);
        short sy0 = (short)(Y0 >> INTER_BITS);
        short ax0 = (short)(X0 & (INTER_TAB_SIZE-1));
        short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1));

        // Top-left, top-right, bottom-left, bottom-right samples.
        float4 v0, v1, v2, v3;
        v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : (float4)0;
        v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : (float4)0;
        v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : (float4)0;
        v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : (float4)0;

        // Per-axis weights. Single-precision literals keep this arithmetic in float;
        // the values are exact multiples of 1/INTER_TAB_SIZE, so the results are the
        // same as with a double-precision 1.0.
        float tab[4];
        float taby[2], tabx[2];
        taby[0] = 1.0f - 1.f/INTER_TAB_SIZE*ay0;
        taby[1] = 1.f/INTER_TAB_SIZE*ay0;
        tabx[0] = 1.0f - 1.f/INTER_TAB_SIZE*ax0;
        tabx[1] = 1.f/INTER_TAB_SIZE*ax0;

        tab[0] = taby[0] * tabx[0];
        tab[1] = taby[0] * tabx[1];
        tab[2] = taby[1] * tabx[0];
        tab[3] = taby[1] * tabx[1];

        float4 sum = 0;
        sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];

        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
            dst[dst_offset+dy*dstStep+dx] = sum;
    }
}
// Bicubic affine warp for four-channel float pixels (float4).
// One work-item computes one destination pixel (dx, dy) as the weighted sum of a
// 4x4 source neighbourhood. The weights are products of the per-axis coefficients
// returned by interpolateCubic and are shared by all channels. Samples outside the
// source count as zero pixels.
__kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
                                    int dst_cols, int dst_rows, int srcStep, int dstStep,
                                    int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    const int dx = get_global_id(0);
    const int dy = get_global_id(1);
    if (dx >= threadCols || dy >= dst_rows)
        return;

    // Half of one 1/INTER_TAB_SIZE step in AB_BITS fixed point.
    const int half_step = AB_SCALE/INTER_TAB_SIZE/2;
    // Offsets and pitches expressed in float4 elements.
    const int srcBase  = (src_offset>>4);
    const int dstBase  = (dst_offset>>4);
    const int srcPitch = (srcStep>>2);
    const int dstPitch = (dstStep>>2);

    int fx = rint(M[0] * dx * AB_SCALE);
    int fy = rint(M[3] * dx * AB_SCALE);
    fx += rint((M[1]*dy + M[2]) * AB_SCALE) + half_step;
    fy += rint((M[4]*dy + M[5]) * AB_SCALE) + half_step;
    fx = fx >> (AB_BITS - INTER_BITS);
    fy = fy >> (AB_BITS - INTER_BITS);

    // Top-left corner of the 4x4 window and the fractional position inside it.
    const short col0  = (short)(fx >> INTER_BITS) - 1;
    const short row0  = (short)(fy >> INTER_BITS) - 1;
    const short fracY = (short)(fy & (INTER_TAB_SIZE-1));
    const short fracX = (short)(fx & (INTER_TAB_SIZE-1));

    // Gather the window in row-major order.
    float4 taps[16];
    for (int r = 0; r < 4; r++)
    {
        for (int c = 0; c < 4; c++)
        {
            const int inside = (col0+c >= 0 && col0+c < src_cols && row0+r >= 0 && row0+r < src_rows);
            taps[r*4 + c] = inside ? src[srcBase + (row0+r) * srcPitch + (col0+c)] : (float4)0;
        }
    }

    // Per-axis cubic coefficients, then the separable 2-D weights.
    float wy[4], wx[4];
    const float ty = 1.f/INTER_TAB_SIZE * fracY;
    const float tx = 1.f/INTER_TAB_SIZE * fracX;
    interpolateCubic(ty, wy);
    interpolateCubic(tx, wx);

    float weights[16];
    int k;
    #pragma unroll 4
    for (k = 0; k < 16; k++)
    {
        weights[k] = wy[(k>>2)] * wx[(k&3)];
    }

    if (dx < 0 || dx >= dst_cols || dy < 0 || dy >= dst_rows)
        return;

    float4 acc = 0;
    #pragma unroll 4
    for (k = 0; k < 16; k++)
    {
        acc += taps[k] * weights[k];
    }
    dst[dstBase + dy*dstPitch + dx] = acc;
}

@ -0,0 +1,688 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Zhang Ying, zhangying913@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
//warpPerspective kernel
//supported data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic.
// F / F4 are the scalar and 4-wide floating-point types used for the transform
// coefficients: double when DOUBLE_SUPPORT is defined (enabling whichever fp64
// extension the compiler advertises), float otherwise.
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
typedef double F;
typedef double4 F4;
#define convert_F4 convert_double4
#else
typedef float F;
typedef float4 F4;
#define convert_F4 convert_float4
#endif
// Sub-pixel positions are quantised to INTER_TAB_SIZE steps per pixel.
#define INTER_BITS 5
#define INTER_TAB_SIZE (1 << INTER_BITS)
// Size of one sub-pixel step. Parenthesised so that the macro stays a single
// operand wherever it is used (e.g. as a divisor or after a unary operator).
#define INTER_SCALE (1.f/INTER_TAB_SIZE)
// Number of fractional bits (and the matching scale) of fixed-point coordinates.
#define AB_BITS max(10, (int)INTER_BITS)
#define AB_SCALE (1 << AB_BITS)
// Number of fractional bits (and the matching scale) of fixed-point interpolation weights.
#define INTER_REMAP_COEF_BITS 15
#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)
// Writes the four cubic interpolation coefficients for fractional position x
// into coeffs[0..3]. The first three are evaluated from the cubic polynomial
// with parameter A = -0.75; the last one is chosen so that all four sum to 1.
inline void interpolateCubic( float x, float* coeffs )
{
    const float A = -0.75f;
    const float xPlus1 = x + 1.f;    // argument used for the first coefficient
    const float xMirror = 1.f - x;   // argument used for the third coefficient

    const float c0 = ((A*xPlus1 - 5.0f*A)*xPlus1 + 8.0f*A)*xPlus1 - 4.0f*A;
    const float c1 = ((A + 2.f)*x - (A + 3.f))*x*x + 1.f;
    const float c2 = ((A + 2.f)*xMirror - (A + 3.f))*xMirror*xMirror + 1.f;

    coeffs[0] = c0;
    coeffs[1] = c1;
    coeffs[2] = c2;
    coeffs[3] = 1.f - c0 - c1 - c2;
}
/**********************************************8UC1*********************************************
***********************************************************************************************/
// Nearest-neighbour perspective warp for single-channel 8-bit pixels (uchar).
// Each active work-item handles four horizontally adjacent destination pixels of
// row get_global_id(1) and stores them with one uchar4 read-modify-write.
//   src, dst               - source / destination pixel buffers
//   src_cols, src_rows     - source size, used for the source bounds test
//   dst_cols, dst_rows     - destination size, used for the destination bounds test
//   srcStep, dstStep       - row pitch added per row to the uchar index
//   src_offset, dst_offset - index of the first pixel inside each buffer
//   M                      - coefficients M[0..8]: source x = (M[0]*dx + M[1]*dy + M[2]) / w,
//                            source y = (M[3]*dx + M[4]*dy + M[5]) / w,
//                            with w = M[6]*dx + M[7]*dy + M[8]
//   threadCols             - number of work-items along x that do any work
__kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
// First of the four destination columns handled here, moved left by the low two
// bits of dst_offset (presumably so the uchar4 access below is 4-byte aligned -
// TODO confirm against the host-side launch code).
dx = (dx<<2) - (dst_offset&3);
F4 DX = (F4)(dx, dx+1, dx+2, dx+3);
// Numerators and denominator of the projective map, one lane per destination column.
F4 X0 = M[0]*DX + M[1]*dy + M[2];
F4 Y0 = M[3]*DX + M[4]*dy + M[5];
F4 W = M[6]*DX + M[7]*dy + M[8],one=1,zero=0;
// Per-lane reciprocal of W; lanes where W is zero get 0 instead of a division by zero.
W = (W!=zero) ? one/W : zero;
// Round to the nearest source pixel and narrow to short.
short4 X = convert_short4(rint(X0*W));
short4 Y = convert_short4(rint(Y0*W));
int4 sx = convert_int4(X);
int4 sy = convert_int4(Y);
int4 DXD = (int4)(dx, dx+1, dx+2, dx+3);
__global uchar4 * d = (__global uchar4 *)(dst+dst_offset+dy*dstStep+dx);
// Current destination contents: lanes outside the image are written back unchanged.
uchar4 dval = *d;
// Per-lane masks: destination pixel inside dst, source pixel inside src.
int4 dcon = DXD >= 0 && DXD < dst_cols && dy >= 0 && dy < dst_rows;
int4 scon = sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows;
int4 spos = src_offset + sy * srcStep + sx;
// Source samples; positions outside the source read as 0.
uchar4 sval;
sval.s0 = scon.s0 ? src[spos.s0] : 0;
sval.s1 = scon.s1 ? src[spos.s1] : 0;
sval.s2 = scon.s2 ? src[spos.s2] : 0;
sval.s3 = scon.s3 ? src[spos.s3] : 0;
// Per-lane select: take the new sample only where the destination lane is valid.
dval = convert_uchar4(dcon) != (uchar4)(0,0,0,0) ? sval : dval;
*d = dval;
}
}
// Bilinear perspective warp for single-channel 8-bit pixels (uchar).
// One work-item computes one destination pixel (dx, dy). The projective source
// position is scaled by INTER_TAB_SIZE, rounded, and split into an integer pixel
// and a fraction in units of 1/INTER_TAB_SIZE. The 2x2 neighbourhood is blended
// with fixed-point weights scaled by INTER_REMAP_COEF_SCALE. Neighbours outside
// the source count as 0.
//   src, dst               - source / destination pixel buffers
//   src_cols, src_rows     - source size, used for the source bounds tests
//   dst_cols, dst_rows     - destination size, used for the destination bounds test
//   srcStep, dstStep       - row pitch added per row to the uchar index
//   src_offset, dst_offset - index of the first pixel inside each buffer
//   M                      - coefficients M[0..8] of the projective map
//   threadCols             - number of work-items along x that do any work
__kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst,
                                          int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
                                          int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);
    if( dx < threadCols && dy < dst_rows)
    {
        F X0 = M[0]*dx + M[1]*dy + M[2];
        F Y0 = M[3]*dx + M[4]*dy + M[5];
        F W = M[6]*dx + M[7]*dy + M[8];
        // INTER_TAB_SIZE / W, or 0 when W is zero. The constants are written in the
        // coefficient type F so that no double-precision literal is involved when
        // F is float; the selected value is the same either way.
        W = (W != (F)0) ? INTER_TAB_SIZE/W : (F)0;
        int X = rint(X0*W);
        int Y = rint(Y0*W);

        // Integer pixel and fractional position.
        int sx = (short)(X >> INTER_BITS);
        int sy = (short)(Y >> INTER_BITS);
        int ay = (short)(Y & (INTER_TAB_SIZE-1));
        int ax = (short)(X & (INTER_TAB_SIZE-1));

        // 2x2 samples in row-major order; positions outside the source read as 0.
        uchar v[4];
        int i;
        #pragma unroll 4
        for(i=0; i<4; i++)
            v[i] = (sx+(i&1) >= 0 && sx+(i&1) < src_cols && sy+(i>>1) >= 0 && sy+(i>>1) < src_rows) ? src[src_offset + (sy+(i>>1)) * srcStep + (sx+(i&1))] : (uchar)0;

        // Per-axis weights. Single-precision literals keep this arithmetic in float;
        // the values are exact multiples of 1/INTER_TAB_SIZE, so the results are the
        // same as with a double-precision 1.0.
        short itab[4];
        float tab1y[2], tab1x[2];
        tab1y[0] = 1.0f - 1.f/INTER_TAB_SIZE*ay;
        tab1y[1] = 1.f/INTER_TAB_SIZE*ay;
        tab1x[0] = 1.0f - 1.f/INTER_TAB_SIZE*ax;
        tab1x[1] = 1.f/INTER_TAB_SIZE*ax;

        // Separable 2-D weights converted to fixed point.
        #pragma unroll 4
        for(i=0; i<4; i++)
        {
            float w = tab1y[(i>>1)] * tab1x[(i&1)];
            itab[i] = convert_short_sat(rint( w * INTER_REMAP_COEF_SCALE ));
        }

        if(dx >=0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
        {
            int sum = 0;
            for ( i =0; i<4; i++ )
            {
                sum += v[i] * itab[i] ;
            }
            // Add half of the weight scale, shift the scale out and saturate to 8 bits.
            dst[dst_offset+dy*dstStep+dx] = convert_uchar_sat ( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
        }
    }
}
// Bicubic perspective warp for single-channel 8-bit pixels (uchar).
// One work-item computes one destination pixel (dx, dy) from a 4x4 source
// neighbourhood using fixed-point weights.
//   src, dst               - source / destination pixel buffers
//   src_cols, src_rows     - source size, used for the source bounds test
//   dst_cols, dst_rows     - destination size, used for the destination bounds test
//   srcStep, dstStep       - row pitch added per row to the uchar index
//   src_offset, dst_offset - index of the first pixel inside each buffer
//   M                      - coefficients M[0..8]: source x = (M[0]*dx + M[1]*dy + M[2]) / w,
//                            source y = (M[3]*dx + M[4]*dy + M[5]) / w,
//                            with w = M[6]*dx + M[7]*dy + M[8]
//   threadCols             - number of work-items along x that do any work
__kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
// Numerators and denominator of the projective map.
F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8];
// INTER_TAB_SIZE / W (0 when W is zero), so X and Y below carry INTER_BITS fractional bits.
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
int X = rint(X0*W);
int Y = rint(Y0*W);
// Top-left corner of the 4x4 neighbourhood: one pixel up and left of the integer position.
short sx = (short)(X >> INTER_BITS) - 1;
short sy = (short)(Y >> INTER_BITS) - 1;
// Fractional position, in units of 1/INTER_TAB_SIZE.
short ay = (short)(Y & (INTER_TAB_SIZE-1));
short ax = (short)(X & (INTER_TAB_SIZE-1));
// 4x4 source samples in row-major order; positions outside the source read as 0.
uchar v[16];
int i, j;
#pragma unroll 4
for(i=0; i<4; i++)
for(j=0; j<4; j++)
{
v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : (uchar)0;
}
short itab[16];
float tab1y[4], tab1x[4];
float axx, ayy;
ayy = 1.f/INTER_TAB_SIZE * ay;
axx = 1.f/INTER_TAB_SIZE * ax;
// Four cubic coefficients per axis.
interpolateCubic(ayy, tab1y);
interpolateCubic(axx, tab1x);
// Separable 2-D weights converted to fixed point (scaled by INTER_REMAP_COEF_SCALE);
// isum accumulates their total.
int isum = 0;
#pragma unroll 16
for( i=0; i<16; i++ )
{
// Note: this local v shadows the sample array v[] inside the loop body only.
F v = tab1y[(i>>2)] * tab1x[(i&3)];
isum += itab[i] = convert_short_sat( rint( v * INTER_REMAP_COEF_SCALE ) );
}
// Rounding can make the weights sum to something other than INTER_REMAP_COEF_SCALE.
// The difference is folded into one weight chosen among rows/columns 2..3:
// the largest one when the sum is too small, the smallest one when it is too large.
if( isum != INTER_REMAP_COEF_SCALE )
{
int k1, k2;
int diff = isum - INTER_REMAP_COEF_SCALE;
int Mk1=2, Mk2=2, mk1=2, mk2=2;
for( k1 = 2; k1 < 4; k1++ )
for( k2 = 2; k2 < 4; k2++ )
{
if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
mk1 = k1, mk2 = k2;
else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
Mk1 = k1, Mk2 = k2;
}
diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff));
}
if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
{
// Weighted sum in integer arithmetic.
int sum=0;
for ( i =0; i<16; i++ )
{
sum += v[i] * itab[i] ;
}
// Add half of the weight scale, shift the scale out and saturate to 8 bits.
dst[dst_offset+dy*dstStep+dx] = convert_uchar_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
}
}
}
/**********************************************8UC4*********************************************
***********************************************************************************************/
// Nearest-neighbour perspective warp for four-channel 8-bit pixels (uchar4).
// One work-item writes one destination pixel (dx, dy). The source position is
// (M[0]*dx + M[1]*dy + M[2]) / w and (M[3]*dx + M[4]*dy + M[5]) / w with
// w = M[6]*dx + M[7]*dy + M[8], rounded to the nearest pixel. Source positions
// outside the image produce a zero pixel.
// Offsets and steps are shifted right by two to index whole uchar4 elements.
__kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
                                      int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
                                      int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    const int dx = get_global_id(0);
    const int dy = get_global_id(1);

    // Work-items beyond the active range have nothing to do.
    if (dx >= threadCols || dy >= dst_rows)
        return;

    const F numX = M[0]*dx + M[1]*dy + M[2];
    const F numY = M[3]*dx + M[4]*dy + M[5];
    F invW = M[6]*dx + M[7]*dy + M[8];
    // Reciprocal of the denominator; 0 when the denominator is zero.
    // NOTE(review): the literals here are double-precision even when F is float - confirm this is intended.
    invW = (invW != 0.0) ? 1./invW : 0.0;

    const int roundedX = rint(numX*invW);
    const int roundedY = rint(numY*invW);
    const short col = (short)roundedX;
    const short row = (short)roundedY;

    if (dx < 0 || dx >= dst_cols || dy < 0 || dy >= dst_rows)
        return;

    uchar4 pixel = (uchar4)0;
    if (col >= 0 && col < src_cols && row >= 0 && row < src_rows)
        pixel = src[(src_offset>>2) + row*(srcStep>>2) + col];

    dst[(dst_offset>>2) + dy*(dstStep>>2) + dx] = pixel;
}
// Bilinear perspective warp for four-channel 8-bit pixels (uchar4).
// One work-item writes one destination pixel (dx, dy). The projective source
// position is scaled by INTER_TAB_SIZE, rounded, and split into an integer pixel
// and a fraction in units of 1/INTER_TAB_SIZE. The 2x2 neighbourhood is blended
// with fixed-point weights scaled by INTER_REMAP_COEF_SCALE. Neighbours outside
// the source count as zero pixels.
//   src, dst               - source / destination uchar4 buffers
//   src_cols, src_rows     - source size, used for the source bounds tests
//   dst_cols, dst_rows     - destination size, used for the destination bounds test
//   srcStep, dstStep       - row pitch; shifted right by two to index uchar4 elements
//   src_offset, dst_offset - first-pixel offset; shifted right by two likewise
//   M                      - coefficients M[0..8] of the projective map
//   threadCols             - number of work-items along x that do any work
__kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
                                          int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
                                          int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);
    if( dx < threadCols && dy < dst_rows)
    {
        // Convert the source offset and pitch to uchar4 element units.
        src_offset = (src_offset>>2);
        srcStep = (srcStep>>2);

        F X0 = M[0]*dx + M[1]*dy + M[2];
        F Y0 = M[3]*dx + M[4]*dy + M[5];
        F W = M[6]*dx + M[7]*dy + M[8];
        // INTER_TAB_SIZE / W, or 0 when W is zero. The constants are written in the
        // coefficient type F so that no double-precision literal is involved when
        // F is float; the selected value is the same either way.
        W = (W != (F)0) ? INTER_TAB_SIZE/W : (F)0;
        int X = rint(X0*W);
        int Y = rint(Y0*W);

        // Integer pixel and fractional position.
        short sx = (short)(X >> INTER_BITS);
        short sy = (short)(Y >> INTER_BITS);
        short ay = (short)(Y & (INTER_TAB_SIZE-1));
        short ax = (short)(X & (INTER_TAB_SIZE-1));

        // Top-left, top-right, bottom-left, bottom-right samples.
        int4 v0, v1, v2, v3;
        v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx]) : (int4)0;
        v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx+1]) : (int4)0;
        v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx]) : (int4)0;
        v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx+1]) : (int4)0;

        // Fixed-point bilinear weights.
        int itab0, itab1, itab2, itab3;
        float taby, tabx;
        taby = 1.f/INTER_TAB_SIZE*ay;
        tabx = 1.f/INTER_TAB_SIZE*ax;
        itab0 = convert_short_sat(rint( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
        itab1 = convert_short_sat(rint( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE ));
        itab2 = convert_short_sat(rint( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
        itab3 = convert_short_sat(rint( taby*tabx * INTER_REMAP_COEF_SCALE ));

        int4 val;
        val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3;

        // Add half of the weight scale, shift the scale out and saturate to 8 bits per channel.
        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
            dst[(dst_offset>>2)+dy*(dstStep>>2)+dx] = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
    }
}
// Bicubic perspective warp for four-channel 8-bit pixels (uchar4).
// One work-item computes one destination pixel (dx, dy) from a 4x4 source
// neighbourhood using fixed-point weights shared by all four channels.
//   src, dst               - source / destination pixel buffers (uchar4 elements)
//   src_cols, src_rows     - source size, used for the source bounds test
//   dst_cols, dst_rows     - destination size, used for the destination bounds test
//   srcStep, dstStep       - row pitch; shifted right by two below to index uchar4 elements
//   src_offset, dst_offset - first-pixel offset; shifted right by two below likewise
//   M                      - coefficients M[0..8]: source x = (M[0]*dx + M[1]*dy + M[2]) / w,
//                            source y = (M[3]*dx + M[4]*dy + M[5]) / w,
//                            with w = M[6]*dx + M[7]*dy + M[8]
//   threadCols             - number of work-items along x that do any work
__kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
// Convert offsets and pitches to uchar4 element units.
src_offset = (src_offset>>2);
srcStep = (srcStep>>2);
dst_offset = (dst_offset>>2);
dstStep = (dstStep>>2);
// Numerators and denominator of the projective map.
F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8];
// INTER_TAB_SIZE / W (0 when W is zero), so X and Y below carry INTER_BITS fractional bits.
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
int X = rint(X0*W);
int Y = rint(Y0*W);
// Top-left corner of the 4x4 neighbourhood: one pixel up and left of the integer position.
short sx = (short)(X >> INTER_BITS) - 1;
short sy = (short)(Y >> INTER_BITS) - 1;
// Fractional position, in units of 1/INTER_TAB_SIZE.
short ay = (short)(Y & (INTER_TAB_SIZE-1));
short ax = (short)(X & (INTER_TAB_SIZE-1));
// 4x4 source samples in row-major order; positions outside the source read as zero pixels.
uchar4 v[16];
int i,j;
#pragma unroll 4
for(i=0; i<4; i++)
for(j=0; j<4; j++)
{
v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0;
}
int itab[16];
float tab1y[4], tab1x[4];
float axx, ayy;
ayy = INTER_SCALE * ay;
axx = INTER_SCALE * ax;
// Four cubic coefficients per axis.
interpolateCubic(ayy, tab1y);
interpolateCubic(axx, tab1x);
// Separable 2-D weights converted to fixed point (scaled by INTER_REMAP_COEF_SCALE);
// isum accumulates their total.
int isum = 0;
#pragma unroll 16
for( i=0; i<16; i++ )
{
float tmp;
tmp = tab1y[(i>>2)] * tab1x[(i&3)] * INTER_REMAP_COEF_SCALE;
itab[i] = rint(tmp);
isum += itab[i];
}
// Rounding can make the weights sum to something other than INTER_REMAP_COEF_SCALE.
// The difference is folded into one weight chosen among rows/columns 2..3:
// the largest one when the sum is too small, the smallest one when it is too large.
if( isum != INTER_REMAP_COEF_SCALE )
{
int k1, k2;
int diff = isum - INTER_REMAP_COEF_SCALE;
int Mk1=2, Mk2=2, mk1=2, mk2=2;
for( k1 = 2; k1 < 4; k1++ )
for( k2 = 2; k2 < 4; k2++ )
{
if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
mk1 = k1, mk2 = k2;
else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
Mk1 = k1, Mk2 = k2;
}
diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff));
}
if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
{
// Per-channel weighted sum in integer arithmetic.
int4 sum=0;
for ( i =0; i<16; i++ )
{
sum += convert_int4(v[i]) * itab[i];
}
// Add half of the weight scale, shift the scale out and saturate to 8 bits per channel.
dst[dst_offset+dy*dstStep+dx] = convert_uchar4_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
}
}
}
/**********************************************32FC1********************************************
***********************************************************************************************/
/*
 * Perspective warp of a single-channel 32-bit float image with
 * nearest-neighbour sampling.
 *
 * Each work-item (dx, dy) projects its destination pixel through the nine
 * coefficients in M (used as a row-major 3x3 matrix), rounds the projected
 * position to the nearest source pixel and copies it; positions outside the
 * source image produce 0.
 *
 * src_offset / dst_offset are shifted right by 2 before use; srcStep /
 * dstStep are used unshifted as row strides in float elements. threadCols
 * bounds the x range of active work-items; the store is additionally guarded
 * by dst_cols / dst_rows. F is a floating-point type defined elsewhere.
 */
__kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
                                      int dst_cols, int dst_rows, int srcStep, int dstStep,
                                      int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if( dx < threadCols && dy < dst_rows)
    {
        F X0 = M[0]*dx + M[1]*dy + M[2];
        F Y0 = M[3]*dx + M[4]*dy + M[5];
        F W = M[6]*dx + M[7]*dy + M[8];
        // A zero denominator yields scale 0 instead of a division by zero.
        W = (W != 0.0) ? 1./W : 0.0;
        int X = rint(X0*W);
        int Y = rint(Y0*W);

        // Saturate instead of truncating: a plain (short) cast wraps modulo 2^16, so a
        // far-out-of-range projection could alias back into the image and sample real pixels.
        short sx = convert_short_sat(X);
        short sy = convert_short_sat(Y);

        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
            dst[(dst_offset>>2)+dy*dstStep+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>2)+sy*srcStep+sx] : 0;
    }
}
/*
 * Perspective warp of a single-channel 32-bit float image with bilinear
 * interpolation.
 *
 * Each work-item (dx, dy) projects its destination pixel through the nine
 * coefficients in M (used as a row-major 3x3 matrix), reads the 2x2 source
 * neighbourhood (taps outside the source image read as 0) and blends it with
 * weights derived from the fractional position, quantised to
 * 1/INTER_TAB_SIZE steps.
 *
 * src_offset / dst_offset are shifted right by 2 before use; srcStep /
 * dstStep are used unshifted as row strides in float elements. threadCols
 * bounds the x range of active work-items; the store is additionally guarded
 * by dst_cols / dst_rows. F, INTER_BITS and INTER_TAB_SIZE are defined
 * elsewhere in this file.
 */
__kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
                                          int dst_cols, int dst_rows, int srcStep, int dstStep,
                                          int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if( dx < threadCols && dy < dst_rows)
    {
        src_offset = (src_offset>>2);

        F X0 = M[0]*dx + M[1]*dy + M[2];
        F Y0 = M[3]*dx + M[4]*dy + M[5];
        F W = M[6]*dx + M[7]*dy + M[8];
        // Fold the sub-pixel scale into the perspective divide; a zero denominator yields scale 0.
        W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
        int X = rint(X0*W);
        int Y = rint(Y0*W);

        // Saturate instead of truncating: a plain (short) cast wraps modulo 2^16, so a
        // far-out-of-range projection could alias back into the image and sample real pixels.
        short sx = convert_short_sat(X >> INTER_BITS);
        short sy = convert_short_sat(Y >> INTER_BITS);
        // Fractional position inside the source pixel, in 1/INTER_TAB_SIZE units.
        short ay = (short)(Y & (INTER_TAB_SIZE-1));
        short ax = (short)(X & (INTER_TAB_SIZE-1));

        // 2x2 neighbourhood: v0 top-left, v1 top-right, v2 bottom-left, v3 bottom-right.
        float v0, v1, v2, v3;

        v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx] : (float)0;
        v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx+1] : (float)0;
        v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx] : (float)0;
        v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx+1] : (float)0;

        float tab[4];
        float taby[2], tabx[2];
        taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay;
        taby[1] = 1.f/INTER_TAB_SIZE*ay;
        tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax;
        tabx[1] = 1.f/INTER_TAB_SIZE*ax;

        tab[0] = taby[0] * tabx[0];
        tab[1] = taby[0] * tabx[1];
        tab[2] = taby[1] * tabx[0];
        tab[3] = taby[1] * tabx[1];

        float sum = 0;
        sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
            dst[(dst_offset>>2)+dy*dstStep+dx] = sum;
    }
}
/*
 * Perspective warp of a single-channel 32-bit float image with bicubic
 * (4x4 tap) interpolation.
 *
 * Each work-item (dx, dy) projects its destination pixel through the nine
 * coefficients in M (used as a row-major 3x3 matrix), gathers the 4x4 source
 * neighbourhood (taps outside the source image read as 0) and accumulates it
 * with separable weights produced by interpolateCubic().
 *
 * src_offset / dst_offset are shifted right by 2 before use; srcStep /
 * dstStep are used unshifted as row strides in float elements. threadCols
 * bounds the x range of active work-items; the store is additionally guarded
 * by dst_cols / dst_rows. F, INTER_BITS, INTER_TAB_SIZE and interpolateCubic()
 * are defined elsewhere in this file.
 */
__kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
                                         int dst_cols, int dst_rows, int srcStep, int dstStep,
                                         int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if( dx < threadCols && dy < dst_rows)
    {
        src_offset = (src_offset>>2);
        dst_offset = (dst_offset>>2);

        F X0 = M[0]*dx + M[1]*dy + M[2];
        F Y0 = M[3]*dx + M[4]*dy + M[5];
        F W = M[6]*dx + M[7]*dy + M[8];
        // Fold the sub-pixel scale into the perspective divide; a zero denominator yields scale 0.
        W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
        int X = rint(X0*W);
        int Y = rint(Y0*W);

        // Saturate instead of truncating: a plain (short) cast wraps modulo 2^16, so a
        // far-out-of-range projection could alias back into the image and sample real pixels.
        short sx = convert_short_sat((X >> INTER_BITS) - 1);
        short sy = convert_short_sat((Y >> INTER_BITS) - 1);
        // Fractional position inside the source pixel, in 1/INTER_TAB_SIZE units.
        short ay = (short)(Y & (INTER_TAB_SIZE-1));
        short ax = (short)(X & (INTER_TAB_SIZE-1));

        // 4x4 neighbourhood, row-major (i>>2 is the row, i&3 the column); out-of-image taps are 0.
        float v[16];
        int i;

        for(i=0; i<16; i++)
            v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float)0;

        float tab[16];
        float tab1y[4], tab1x[4];
        float axx, ayy;

        ayy = 1.f/INTER_TAB_SIZE * ay;
        axx = 1.f/INTER_TAB_SIZE * ax;
        interpolateCubic(ayy, tab1y);
        interpolateCubic(axx, tab1x);

        // Outer product of the 1-D weights gives the 16 2-D weights.
#pragma unroll 4
        for( i=0; i<16; i++ )
        {
            tab[i] = tab1y[(i>>2)] * tab1x[(i&3)];
        }

        if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
        {
            float sum = 0;
#pragma unroll 4
            for ( i =0; i<16; i++ )
            {
                sum += v[i] * tab[i];
            }
            dst[dst_offset+dy*dstStep+dx] = sum;
        }
    }
}
/**********************************************32FC4********************************************
***********************************************************************************************/
/*
 * Perspective warp of a 4-channel 32-bit float image with nearest-neighbour
 * sampling.
 *
 * Each work-item (dx, dy) projects its destination pixel through the nine
 * coefficients in M (used as a row-major 3x3 matrix), rounds the projected
 * position to the nearest source pixel and copies it; positions outside the
 * source image produce a zero vector.
 *
 * src_offset / dst_offset are shifted right by 4 and srcStep / dstStep by 2
 * so that they index float4 elements. threadCols bounds the x range of active
 * work-items; the store is additionally guarded by dst_cols / dst_rows.
 * F is a floating-point type defined elsewhere.
 */
__kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
                                      int dst_cols, int dst_rows, int srcStep, int dstStep,
                                      int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if( dx < threadCols && dy < dst_rows)
    {
        F X0 = M[0]*dx + M[1]*dy + M[2];
        F Y0 = M[3]*dx + M[4]*dy + M[5];
        F W = M[6]*dx + M[7]*dy + M[8];
        // A zero denominator yields scale 0 instead of a division by zero.
        W =(W != 0.0)? 1./W : 0.0;
        int X = rint(X0*W);
        int Y = rint(Y0*W);

        // Saturate instead of truncating: a plain (short) cast wraps modulo 2^16, so a
        // far-out-of-range projection could alias back into the image and sample real pixels.
        short sx = convert_short_sat(X);
        short sy = convert_short_sat(Y);

        // The border value is spelled as a float4 to match the other C4_D5 kernels.
        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
            dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>4)+sy*(srcStep>>2)+sx] : (float4)0;
    }
}
/*
 * Perspective warp of a 4-channel 32-bit float image with bilinear
 * interpolation.
 *
 * Each work-item (dx, dy) projects its destination pixel through the nine
 * coefficients in M (used as a row-major 3x3 matrix), reads the 2x2 source
 * neighbourhood (taps outside the source image read as a zero vector) and
 * blends it with weights derived from the fractional position, quantised to
 * 1/INTER_TAB_SIZE steps.
 *
 * src_offset / dst_offset are shifted right by 4 and srcStep / dstStep by 2
 * so that they index float4 elements. threadCols bounds the x range of active
 * work-items; the store is additionally guarded by dst_cols / dst_rows.
 * F, INTER_BITS and INTER_TAB_SIZE are defined elsewhere in this file.
 */
__kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
                                          int dst_cols, int dst_rows, int srcStep, int dstStep,
                                          int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if( dx < threadCols && dy < dst_rows)
    {
        src_offset = (src_offset>>4);
        dst_offset = (dst_offset>>4);
        srcStep = (srcStep>>2);
        dstStep = (dstStep>>2);

        F X0 = M[0]*dx + M[1]*dy + M[2];
        F Y0 = M[3]*dx + M[4]*dy + M[5];
        F W = M[6]*dx + M[7]*dy + M[8];
        // Fold the sub-pixel scale into the perspective divide; a zero denominator yields scale 0.
        W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
        int X = rint(X0*W);
        int Y = rint(Y0*W);

        // Saturate instead of truncating: a plain (short) cast wraps modulo 2^16, so a
        // far-out-of-range projection could alias back into the image and sample real pixels.
        short sx0 = convert_short_sat(X >> INTER_BITS);
        short sy0 = convert_short_sat(Y >> INTER_BITS);
        // Fractional position inside the source pixel, in 1/INTER_TAB_SIZE units.
        short ay0 = (short)(Y & (INTER_TAB_SIZE-1));
        short ax0 = (short)(X & (INTER_TAB_SIZE-1));

        // 2x2 neighbourhood: v0 top-left, v1 top-right, v2 bottom-left, v3 bottom-right.
        float4 v0, v1, v2, v3;

        v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : (float4)0;
        v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : (float4)0;
        v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : (float4)0;
        v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : (float4)0;

        float tab[4];
        float taby[2], tabx[2];
        taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay0;
        taby[1] = 1.f/INTER_TAB_SIZE*ay0;
        tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0;
        tabx[1] = 1.f/INTER_TAB_SIZE*ax0;

        tab[0] = taby[0] * tabx[0];
        tab[1] = taby[0] * tabx[1];
        tab[2] = taby[1] * tabx[0];
        tab[3] = taby[1] * tabx[1];

        float4 sum = 0;
        sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
            dst[dst_offset+dy*dstStep+dx] = sum;
    }
}
/*
 * Perspective warp of a 4-channel 32-bit float image with bicubic (4x4 tap)
 * interpolation.
 *
 * Each work-item (dx, dy) projects its destination pixel through the nine
 * coefficients in M (used as a row-major 3x3 matrix), gathers the 4x4 source
 * neighbourhood (taps outside the source image read as a zero vector) and
 * accumulates it with separable weights produced by interpolateCubic().
 *
 * src_offset / dst_offset are shifted right by 4 and srcStep / dstStep by 2
 * so that they index float4 elements. threadCols bounds the x range of active
 * work-items; the store is additionally guarded by dst_cols / dst_rows.
 * F, INTER_BITS, INTER_TAB_SIZE and interpolateCubic() are defined elsewhere
 * in this file.
 */
__kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 * dst,
                                         int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
                                         int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if( dx < threadCols && dy < dst_rows )
    {
        src_offset = (src_offset>>4);
        dst_offset = (dst_offset>>4);
        srcStep = (srcStep>>2);
        dstStep = (dstStep>>2);

        F X0 = M[0]*dx + M[1]*dy + M[2];
        F Y0 = M[3]*dx + M[4]*dy + M[5];
        F W = M[6]*dx + M[7]*dy + M[8];
        // Fold the sub-pixel scale into the perspective divide; a zero denominator yields scale 0.
        W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
        int X = rint(X0*W);
        int Y = rint(Y0*W);

        // Saturate instead of truncating: a plain (short) cast wraps modulo 2^16, so a
        // far-out-of-range projection could alias back into the image and sample real pixels.
        short sx = convert_short_sat((X >> INTER_BITS) - 1);
        short sy = convert_short_sat((Y >> INTER_BITS) - 1);
        // Fractional position inside the source pixel, in 1/INTER_TAB_SIZE units.
        short ay = (short)(Y & (INTER_TAB_SIZE-1));
        short ax = (short)(X & (INTER_TAB_SIZE-1));

        // 4x4 neighbourhood, row-major (i>>2 is the row, i&3 the column); out-of-image taps are 0.
        float4 v[16];
        int i;

        for(i=0; i<16; i++)
            v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float4)0;

        float tab[16];
        float tab1y[4], tab1x[4];
        float axx, ayy;

        ayy = 1.f/INTER_TAB_SIZE * ay;
        axx = 1.f/INTER_TAB_SIZE * ax;
        interpolateCubic(ayy, tab1y);
        interpolateCubic(axx, tab1x);

        // Outer product of the 1-D weights gives the 16 2-D weights.
#pragma unroll 4
        for( i=0; i<16; i++ )
        {
            tab[i] = tab1y[(i>>2)] * tab1x[(i&3)];
        }

        if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
        {
            float4 sum = 0;
#pragma unroll 4
            for ( i =0; i<16; i++ )
            {
                sum += v[i] * tab[i];
            }
            dst[dst_offset+dy*dstStep+dx] = sum;
        }
    }
}

@ -48,6 +48,8 @@
#include "opencv2/imgproc/imgproc_c.h"
#include "opencv2/core/private.hpp"
#include "opencv2/core/ocl.hpp"
#include "opencl_kernels.hpp"
#include <math.h>
#include <assert.h>

@ -0,0 +1,81 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "test_precomp.hpp"
#include <string>
using namespace cv;
using namespace std;
// Regression test for the Mat/UMat "transparent API" in imgproc.
//
// Runs the same cvtColor -> resize -> equalizeHist pipeline once on a Mat and
// once on a UMat obtained from the same image, then checks that both paths
// produce an output of the same size and type. The maximum absolute difference
// between the two results is written to the test log for inspection.
//
// The test is fully automatic: it opens no windows and never waits for a key
// press, so it can run unattended on headless machines.
class CV_ImgprocUMatTest : public cvtest::BaseTest
{
public:
    CV_ImgprocUMatTest() {}
    ~CV_ImgprocUMatTest() {}
protected:
    void run(int)
    {
        string imgpath = string(ts->get_data_path()) + "shared/lena.png";
        Mat img = imread(imgpath, 1), gray, smallimg, result;

        // Report missing test data explicitly instead of failing inside cvtColor.
        if( img.empty() )
        {
            ts->printf(cvtest::TS::LOG, "Can not read image %s\n", imgpath.c_str());
            ts->set_failed_test_info(cvtest::TS::FAIL_MISSING_TEST_DATA);
            return;
        }

        UMat uimg = img.getUMat(ACCESS_READ), ugray, usmallimg, uresult;

        // Reference path: plain Mat.
        cvtColor(img, gray, COLOR_BGR2GRAY);
        resize(gray, smallimg, Size(), 0.75, 0.75, INTER_LINEAR);
        equalizeHist(smallimg, result);

        // Path under test: UMat.
        cvtColor(uimg, ugray, COLOR_BGR2GRAY);
        resize(ugray, usmallimg, Size(), 0.75, 0.75, INTER_LINEAR);
        equalizeHist(usmallimg, uresult);

        Mat uresultMat = uresult.getMat(ACCESS_READ);
        if( uresultMat.size() != result.size() || uresultMat.type() != result.type() )
        {
            ts->printf(cvtest::TS::LOG, "Mat and UMat results differ in size or type\n");
            ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_OUTPUT);
            return;
        }

        // Logged only: the two paths are not required here to match bit-for-bit.
        ts->printf(cvtest::TS::LOG, "max |Mat - UMat| = %g\n",
                   cv::norm(result, uresultMat, NORM_INF));

        ts->set_failed_test_info(cvtest::TS::OK);
    }
};

TEST(Imgproc_UMat, regression) { CV_ImgprocUMatTest test; test.safe_run(); }

@ -52,6 +52,8 @@
#include "opencv2/nonfree/cuda.hpp"
#include "opencv2/core/private.cuda.hpp"
#include "opencv2/core/ocl.hpp"
#include "opencv2/opencv_modules.hpp"
#ifdef HAVE_OPENCV_CUDAARITHM

@ -51,6 +51,8 @@
using namespace cv;
using namespace cv::ocl;
static ProgramEntry surf = cv::ocl::nonfree::surf;
namespace cv
{
namespace ocl

@ -159,14 +159,14 @@ public:
CV_WRAP virtual bool empty() const;
CV_WRAP bool load( const String& filename );
virtual bool read( const FileNode& node );
CV_WRAP virtual void detectMultiScale( const Mat& image,
CV_WRAP virtual void detectMultiScale( InputArray image,
CV_OUT std::vector<Rect>& objects,
double scaleFactor = 1.1,
int minNeighbors = 3, int flags = 0,
Size minSize = Size(),
Size maxSize = Size() );
CV_WRAP virtual void detectMultiScale( const Mat& image,
CV_WRAP virtual void detectMultiScale( InputArray image,
CV_OUT std::vector<Rect>& objects,
CV_OUT std::vector<int>& numDetections,
double scaleFactor=1.1,
@ -174,7 +174,7 @@ public:
Size minSize=Size(),
Size maxSize=Size() );
CV_WRAP virtual void detectMultiScale( const Mat& image,
CV_WRAP virtual void detectMultiScale( InputArray image,
CV_OUT std::vector<Rect>& objects,
CV_OUT std::vector<int>& rejectLevels,
CV_OUT std::vector<double>& levelWeights,

@ -1154,13 +1154,14 @@ void CascadeClassifier::detectMultiScaleNoGrouping( const Mat& image, std::vecto
}
}
void CascadeClassifier::detectMultiScale( const Mat& image, std::vector<Rect>& objects,
void CascadeClassifier::detectMultiScale( InputArray _image, std::vector<Rect>& objects,
std::vector<int>& rejectLevels,
std::vector<double>& levelWeights,
double scaleFactor, int minNeighbors,
int flags, Size minObjectSize, Size maxObjectSize,
bool outputRejectLevels )
{
Mat image = _image.getMat();
CV_Assert( scaleFactor > 1 && image.depth() == CV_8U );
if( empty() )
@ -1188,21 +1189,23 @@ void CascadeClassifier::detectMultiScale( const Mat& image, std::vector<Rect>& o
}
}
void CascadeClassifier::detectMultiScale( const Mat& image, std::vector<Rect>& objects,
void CascadeClassifier::detectMultiScale( InputArray _image, std::vector<Rect>& objects,
double scaleFactor, int minNeighbors,
int flags, Size minObjectSize, Size maxObjectSize)
{
Mat image = _image.getMat();
std::vector<int> fakeLevels;
std::vector<double> fakeWeights;
detectMultiScale( image, objects, fakeLevels, fakeWeights, scaleFactor,
minNeighbors, flags, minObjectSize, maxObjectSize );
}
void CascadeClassifier::detectMultiScale( const Mat& image, std::vector<Rect>& objects,
void CascadeClassifier::detectMultiScale( InputArray _image, std::vector<Rect>& objects,
std::vector<int>& numDetections, double scaleFactor,
int minNeighbors, int flags, Size minObjectSize,
Size maxObjectSize )
{
Mat image = _image.getMat();
CV_Assert( scaleFactor > 1 && image.depth() == CV_8U );
if( empty() )

@ -0,0 +1,423 @@
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Niko Li, newlife20080214@gmail.com
// Wang Weiyan, wangweiyanster@gmail.com
// Jia Haipeng, jiahaipeng95@gmail.com
// Nathan, liujun@multicorewareinc.com
// Peng Xiao, pengxiao@outlook.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//
// printf is an AMD vendor extension; request it only where the compiler
// advertises it so the program still builds cleanly on other OpenCL platforms.
#ifdef cl_amd_printf
#pragma OPENCL EXTENSION cl_amd_printf : enable
#endif

// Maximum number of rectangles per Haar feature.
#define CV_HAAR_FEATURE_MAX 3

// Four-corner box sum over an integral image named `sum` that must be in scope
// at the expansion site; `rect` supplies the corner offsets p0..p3.
#define calc_sum(rect,offset) (sum[(rect).p0+offset] - sum[(rect).p1+offset] - sum[(rect).p2+offset] + sum[(rect).p3+offset])
// Same as calc_sum, for rectangles whose corner offsets are arrays indexed by i.
#define calc_sum1(rect,offset,i) (sum[(rect).p0[i]+offset] - sum[(rect).p1[i]+offset] - sum[(rect).p2[i]+offset] + sum[(rect).p3[i]+offset])

// Element types of the integral image and the squared integral image.
typedef int sumtype;
typedef float sqsumtype;

// 1: every weak classifier is a single-node stump; 0: two-level trees.
// May be overridden from the build options.
#ifndef STUMP_BASED
#define STUMP_BASED 1
#endif
// One weak-classifier node of the cascade.
// The kernel below does not access most fields by name: it reinterprets the
// struct through vector loads, so field order and alignment must not change.
// NOTE(review): the host side presumably fills an identically laid-out struct - confirm.
typedef struct __attribute__((aligned (128) )) GpuHidHaarTreeNode
{
// Up to CV_HAAR_FEATURE_MAX rectangles, four ints each; each row is loaded as one int4.
// The kernel adds a pixel offset to .x/.z and uses .y/.w as row indices.
int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned (64)));
// Per-rectangle weights. Loaded together with `threshold` as a single float4.
float weight[CV_HAAR_FEATURE_MAX];
// Node threshold; read as the .w lane of the float4 load that starts at weight[0].
float threshold;
// Leaf values, loaded as a float3.
float alpha[3] __attribute__((aligned (16)));
// Child links; in the non-stump path the kernel only tests them for non-zero.
int left __attribute__((aligned (4)));
int right __attribute__((aligned (4)));
}
GpuHidHaarTreeNode;
// Classifier descriptor: a node count plus pointers to its nodes and alpha values.
// Not referenced by the kernel in this file.
typedef struct __attribute__((aligned (32))) GpuHidHaarClassifier
{
int count __attribute__((aligned (4)));
GpuHidHaarTreeNode* node __attribute__((aligned (8)));
float* alpha __attribute__((aligned (8)));
}
GpuHidHaarClassifier;
// One cascade stage.
// The kernel reads the first two fields as a single int2 (x = count,
// y = bit pattern of threshold), so they must stay first and adjacent.
typedef struct __attribute__((aligned (64))) GpuHidHaarStageClassifier
{
// Number of weak classifiers evaluated in this stage.
int count __attribute__((aligned (4)));
// A window passes the stage when its accumulated sum is >= threshold.
float threshold __attribute__((aligned (4)));
// Not read by the kernel in this file.
int two_rects __attribute__((aligned (4)));
// Unused padding fields.
int reserved0 __attribute__((aligned (8)));
int reserved1 __attribute__((aligned (8)));
int reserved2 __attribute__((aligned (8)));
int reserved3 __attribute__((aligned (8)));
}
GpuHidHaarStageClassifier;
// Cascade-level descriptor. Not referenced by the kernel in this file, which
// receives the corresponding values (p, pq, correction) as separate arguments.
typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
{
// Number of stages - presumably; TODO confirm against the host code.
int count __attribute__((aligned (4)));
int is_stump_based __attribute__((aligned (4)));
int has_tilted_features __attribute__((aligned (4)));
int is_tree __attribute__((aligned (4)));
// NOTE(review): pq0..pq3 / p0..p3 look like the window corner offsets for the
// squared-sum and sum images respectively - confirm against the host code.
int pq0 __attribute__((aligned (4)));
int pq1 __attribute__((aligned (4)));
int pq2 __attribute__((aligned (4)));
int pq3 __attribute__((aligned (4)));
int p0 __attribute__((aligned (4)));
int p1 __attribute__((aligned (4)));
int p2 __attribute__((aligned (4)));
int p3 __attribute__((aligned (4)));
float inv_window_area __attribute__((aligned (4)));
} GpuHidHaarClassifierCascade;
// Haar cascade evaluation over a set of pre-scaled integral images.
//
// Launched with a fixed 8x8 work-group. For every scale described in `info`,
// each work-group walks over its share of 8x8 pixel tiles and:
//   1. copies the integral-image patch covering the tile's windows into local memory;
//   2. lets every work-item run stages [start_stage, split_stage) for its own window;
//   3. queues the surviving windows in local memory and evaluates stages
//      [split_stage, end_stage) cooperatively, several work-items per window;
//   4. writes the windows that pass every stage to `candidate`.
//
// Arguments:
//   stagecascadeptr  stage table; only the leading (count, threshold) pair is read
//   info             one int4 per scale: x = width<<16 | height,
//                    y = groups-per-row<<16 | total groups, z = offset of this
//                    scale inside sum1/sqsum1, w = bit pattern of the float scale factor
//   nodeptr          weak-classifier nodes, consumed sequentially
//   sum1, sqsum1     integral and squared-integral images of all scales
//   candidate        output; 256 int4 slots per work-group (x, y, size, size)
//   pixelstep        row stride of sum1/sqsum1 in elements
//   loopcount        number of scales
//   startnode        index of the first node of start_stage
//   splitnode        index of the first node of split_stage
//   p, pq            window corner offsets used for the mean (local sum data)
//                    and the variance (global sqsum data)
//   correction       multiplier applied to the window sums when computing mean/variance
__kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade(
global GpuHidHaarStageClassifier * stagecascadeptr,
global int4 * info,
global GpuHidHaarTreeNode * nodeptr,
global const int * restrict sum1,
global const float * restrict sqsum1,
global int4 * candidate,
const int pixelstep,
const int loopcount,
const int start_stage,
const int split_stage,
const int end_stage,
const int startnode,
const int splitnode,
const int4 p,
const int4 pq,
const float correction)
{
int grpszx = get_local_size(0);
int grpszy = get_local_size(1);
int grpnumx = get_num_groups(0);
int grpidx = get_group_id(0);
int lclidx = get_local_id(0);
int lclidy = get_local_id(1);
int lcl_sz = mul24(grpszx,grpszy);
int lcl_id = mad24(lclidy,grpszx,lclidx);
// A single local buffer is carved into the regions below.
__local int lclshare[1024];
__local int* lcldata = lclshare;// cached integral-image patch (28x28 ints)
__local int* glboutindex = lcldata + 28*28;// number of candidates already written by this group
__local int* lclcount = glboutindex + 1;// number of windows currently in the pass queue
__local int* lcloutindex = lclcount + 1;// pass queue: (packed coordinate, variance bits) pairs
__local float* partialsum = (__local float*)(lcloutindex + (lcl_sz<<1));
glboutindex[0]=0;
// Each work-group owns 256 consecutive candidate slots.
int outputoff = mul24(grpidx,256);
//assume window size is 20X20
#define WINDOWSIZE 20+1
//make sure readwidth is the multiple of 4
//ystep =1, from host code
int readwidth = ((grpszx-1 + WINDOWSIZE+3)>>2)<<2;
int readheight = grpszy-1+WINDOWSIZE;
int read_horiz_cnt = readwidth >> 2;//each read int4
int total_read = mul24(read_horiz_cnt,readheight);
// The shift by 6 divides by the work-group size of 64 (8x8).
int read_loop = (total_read + lcl_sz - 1) >> 6;
// Clear this group's output slots: 64 work-items x 4 entries = 256.
candidate[outputoff+(lcl_id<<2)] = (int4)0;
candidate[outputoff+(lcl_id<<2)+1] = (int4)0;
candidate[outputoff+(lcl_id<<2)+2] = (int4)0;
candidate[outputoff+(lcl_id<<2)+3] = (int4)0;
for(int scalei = 0; scalei <loopcount; scalei++)
{
// Unpack the per-scale description.
int4 scaleinfo1= info[scalei];
int width = (scaleinfo1.x & 0xffff0000) >> 16;
int height = scaleinfo1.x & 0xffff;
int grpnumperline =(scaleinfo1.y & 0xffff0000) >> 16;
int totalgrp = scaleinfo1.y & 0xffff;
int imgoff = scaleinfo1.z;
float factor = as_float(scaleinfo1.w);
__global const int * sum = sum1 + imgoff;
__global const float * sqsum = sqsum1 + imgoff;
// Tiles of this scale are distributed round-robin over the launched work-groups.
for(int grploop=grpidx; grploop<totalgrp; grploop+=grpnumx)
{
int grpidy = grploop / grpnumperline;
// Note: this declaration shadows the outer grpidx (the real group id) with the tile column.
int grpidx = grploop - mul24(grpidy, grpnumperline);
int x = mad24(grpidx,grpszx,lclidx);
int y = mad24(grpidy,grpszy,lclidy);
int grpoffx = x-lclidx;
int grpoffy = y-lclidy;
// Cooperative load of the integral-image patch, one int4 per work-item per pass.
for(int i=0; i<read_loop; i++)
{
int pos_id = mad24(i,lcl_sz,lcl_id);
// Work-items past the end of the patch reload element 0 instead of idling.
pos_id = pos_id < total_read ? pos_id : 0;
int lcl_y = pos_id / read_horiz_cnt;
int lcl_x = pos_id - mul24(lcl_y, read_horiz_cnt);
int glb_x = grpoffx + (lcl_x<<2);
int glb_y = grpoffy + lcl_y;
// Rows are clamped to the last image row.
int glb_off = mad24(min(glb_y, height - 1),pixelstep,glb_x);
int4 data = *(__global int4*)&sum[glb_off];
int lcl_off = mad24(lcl_y, readwidth, lcl_x<<2);
vstore4(data, 0, &lcldata[lcl_off]);
}
lcloutindex[lcl_id] = 0;
lclcount[0] = 0;
int result = 1;
int nodecounter= startnode;
float mean, variance_norm_factor;
barrier(CLK_LOCAL_MEM_FENCE);
// Position of this work-item's window inside the cached patch.
int lcl_off = mad24(lclidy,readwidth,lclidx);
int4 cascadeinfo1, cascadeinfo2;
cascadeinfo1 = p;
cascadeinfo2 = pq;
cascadeinfo1.x +=lcl_off;
cascadeinfo1.z +=lcl_off;
// Window mean from the cached integral image.
mean = (lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.x)] - lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.z)] -
lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.x)] + lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.z)])
*correction;
int p_offset = mad24(y, pixelstep, x);
cascadeinfo2.x +=p_offset;
cascadeinfo2.z +=p_offset;
// Window sum of squares, read directly from global memory.
variance_norm_factor =sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.x)] - sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.z)] -
sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.x)] + sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.z)];
variance_norm_factor = variance_norm_factor * correction - mean * mean;
// Negative variance (numerical noise) falls back to a factor of 1.
variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f;
// Phase 1: every work-item evaluates the early stages for its own window.
for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ )
{
float stage_sum = 0.f;
// int2 view of the stage: x = node count, y = bit pattern of the threshold.
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
float stagethreshold = as_float(stageinfo.y);
for(int nodeloop = 0; nodeloop < stageinfo.x; )
{
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter);
// Vector loads over the node: three rectangles, weights (+threshold in w.w), leaf values.
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
info1.x +=lcl_off;
info1.z +=lcl_off;
info2.x +=lcl_off;
info2.z +=lcl_off;
// Weighted sum of the three rectangle box sums.
float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
info3.x +=lcl_off;
info3.z +=lcl_off;
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
bool passThres = classsum >= nodethreshold;
#if STUMP_BASED
// Stump: one node per weak classifier, two leaf values.
stage_sum += passThres ? alpha3.y : alpha3.x;
nodecounter++;
nodeloop++;
#else
// Two-node trees: even node indices are roots, odd ones their child.
bool isRootNode = (nodecounter & 1) == 0;
if(isRootNode)
{
if( (passThres && currentnodeptr->right) ||
(!passThres && currentnodeptr->left))
{
// Descend to the child node stored right after the root.
nodecounter ++;
}
else
{
// Root acts as a leaf: take its value and skip the child.
stage_sum += alpha3.x;
nodecounter += 2;
nodeloop ++;
}
}
else
{
stage_sum += passThres ? alpha3.z : alpha3.y;
nodecounter ++;
nodeloop ++;
}
#endif
}
result = (stage_sum >= stagethreshold);
}
// Survivors inside the image are appended to the local pass queue.
if(result && (x < width) && (y < height))
{
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx;
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
}
barrier(CLK_LOCAL_MEM_FENCE);
int queuecount = lclcount[0];
barrier(CLK_LOCAL_MEM_FENCE);
nodecounter = splitnode;
// Phase 2: remaining stages are evaluated cooperatively on the queued windows.
for(int stageloop = split_stage; stageloop< end_stage && queuecount>0; stageloop++)
{
lclcount[0]=0;
barrier(CLK_LOCAL_MEM_FENCE);
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
float stagethreshold = as_float(stageinfo.y);
// 1<<perfscale windows are processed at once, each by lcl_compute_win work-items.
int perfscale = queuecount > 4 ? 3 : 2;
int queuecount_loop = (queuecount + (1<<perfscale)-1) >> perfscale;
int lcl_compute_win = lcl_sz >> perfscale;
// Which queued window this work-item helps with, and its index inside that team.
int lcl_compute_win_id = (lcl_id >>(6-perfscale));
int lcl_loops = (stageinfo.x + lcl_compute_win -1) >> (6-perfscale);
int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale));
for(int queueloop=0; queueloop<queuecount_loop; queueloop++)
{
float stage_sum = 0.f;
int temp_coord = lcloutindex[lcl_compute_win_id<<1];
float variance_norm_factor = as_float(lcloutindex[(lcl_compute_win_id<<1)+1]);
// Unpack (row<<16 | col) into an offset inside the cached patch.
int queue_pixel = mad24(((temp_coord & (int)0xffff0000)>>16),readwidth,temp_coord & 0xffff);
if(lcl_compute_win_id < queuecount)
{
// Each team member handles nodes lcl_compute_id, +lcl_compute_win, ... of the stage.
int tempnodecounter = lcl_compute_id;
float part_sum = 0.f;
const int stump_factor = STUMP_BASED ? 1 : 2;
int root_offset = 0;
for(int lcl_loop=0; lcl_loop<lcl_loops && tempnodecounter<stageinfo.x;)
{
__global GpuHidHaarTreeNode* currentnodeptr =
nodeptr + (nodecounter + tempnodecounter) * stump_factor + root_offset;
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
info1.x +=queue_pixel;
info1.z +=queue_pixel;
info2.x +=queue_pixel;
info2.z +=queue_pixel;
float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
info3.x +=queue_pixel;
info3.z +=queue_pixel;
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
bool passThres = classsum >= nodethreshold;
#if STUMP_BASED
part_sum += passThres ? alpha3.y : alpha3.x;
tempnodecounter += lcl_compute_win;
lcl_loop++;
#else
// root_offset selects the root (0) or its child (1) of the current two-node tree.
if(root_offset == 0)
{
if( (passThres && currentnodeptr->right) ||
(!passThres && currentnodeptr->left))
{
root_offset = 1;
}
else
{
part_sum += alpha3.x;
tempnodecounter += lcl_compute_win;
lcl_loop++;
}
}
else
{
part_sum += passThres ? alpha3.z : alpha3.y;
tempnodecounter += lcl_compute_win;
lcl_loop++;
root_offset = 0;
}
#endif
}//end for(int lcl_loop=0;lcl_loop<lcl_loops;lcl_loop++)
partialsum[lcl_id]=part_sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(lcl_compute_win_id < queuecount)
{
// The first member of each team adds up the team's partial sums.
for(int i=0; i<lcl_compute_win && (lcl_compute_id==0); i++)
{
stage_sum += partialsum[lcl_id+i];
}
if(stage_sum >= stagethreshold && (lcl_compute_id==0))
{
// Window passed this stage: re-queue it for the next one.
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex<<1] = temp_coord;
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
}
// Move on to the next batch of queued windows.
lcl_compute_win_id +=(1<<perfscale);
}
barrier(CLK_LOCAL_MEM_FENCE);
}//end for(int queueloop=0;queueloop<queuecount_loop;queueloop++)
queuecount = lclcount[0];
barrier(CLK_LOCAL_MEM_FENCE);
nodecounter += stageinfo.x;
}//end for(int stageloop = splitstage; stageloop< endstage && queuecount>0;stageloop++)
// Windows still queued have passed every stage: emit them as candidates.
if(lcl_id<queuecount)
{
int temp = lcloutindex[lcl_id<<1];
int x = mad24(grpidx,grpszx,temp & 0xffff);
int y = mad24(grpidy,grpszy,((temp & (int)0xffff0000) >> 16));
// NOTE(review): the base index is read with a plain load while other work-items
// may already have executed the atomic_inc below; the slot computation appears to
// rely on all work-items reading it before any increment - confirm.
temp = glboutindex[0];
int4 candidate_result;
// Rectangle in original-image coordinates; the size is 20 * scale factor.
candidate_result.zw = (int2)convert_int_rtn(factor*20.f);
candidate_result.x = convert_int_rtn(x*factor);
candidate_result.y = convert_int_rtn(y*factor);
atomic_inc(glboutindex);
candidate[outputoff+temp+lcl_id] = candidate_result;
}
barrier(CLK_LOCAL_MEM_FENCE);
}//end for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)
}//end for(int scalei = 0; scalei <loopcount; scalei++)
}

@ -0,0 +1,306 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Wu Xinglong, wxl370@126.com
// Sen Liu, swjtuls1987@126.com
// Peng Xiao, pengxiao@outlook.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
// Shared definitions for the Haar cascade kernel gpuRunHaarClassifierCascade_scaled2.
// The AMD printf extension is left disabled; enable it only for debugging.
//#pragma OPENCL EXTENSION cl_amd_printf:enable
// Maximum number of rectangles per Haar feature.
#define CV_HAAR_FEATURE_MAX 3
// Element types of the integral image and the squared integral image.
typedef int sumtype;
typedef float sqsumtype;
// One weak-classifier node of the cascade.
// NOTE(review): the host side presumably fills an identically laid-out struct,
// so field order and alignment should not be changed - confirm.
typedef struct __attribute__((aligned(128))) GpuHidHaarTreeNode
{
// Up to CV_HAAR_FEATURE_MAX rectangles, four ints each.
int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned(64)));
// Per-rectangle weights.
float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/;
// Node threshold; placed directly after the weights.
float threshold /*__attribute__((aligned (4)))*/;
// Leaf values.
float alpha[3] __attribute__((aligned(16)));
// Child links.
int left __attribute__((aligned(4)));
int right __attribute__((aligned(4)));
}
GpuHidHaarTreeNode;
// Classifier descriptor: a node count plus pointers to its nodes and alpha values.
typedef struct __attribute__((aligned(32))) GpuHidHaarClassifier
{
int count __attribute__((aligned(4)));
GpuHidHaarTreeNode *node __attribute__((aligned(8)));
float *alpha __attribute__((aligned(8)));
}
GpuHidHaarClassifier;
/* Per-stage record read by gpuRunHaarClassifierCascade_scaled2. */
typedef struct __attribute__((aligned(64))) GpuHidHaarStageClassifier
{
/* Number of classifiers evaluated for this stage (upper bound of the kernel's
 * node loop). */
int count __attribute__((aligned(4)));
/* A window passes the stage when its accumulated stage sum is >= threshold. */
float threshold __attribute__((aligned(4)));
/* Not read by the kernels visible in this file. */
int two_rects __attribute__((aligned(4)));
/* Reserved / padding fields; not read by the kernels visible in this file. */
int reserved0 __attribute__((aligned(8)));
int reserved1 __attribute__((aligned(8)));
int reserved2 __attribute__((aligned(8)));
int reserved3 __attribute__((aligned(8)));
}
GpuHidHaarStageClassifier;
/* Cascade-level header.
 * NOTE(review): none of these fields is read by the kernels visible in this
 * file; the meanings noted below are inferred from the names only and should
 * be confirmed against the host code that fills the structure. */
typedef struct __attribute__((aligned(64))) GpuHidHaarClassifierCascade
{
int count __attribute__((aligned(4)));
int is_stump_based __attribute__((aligned(4)));
int has_tilted_features __attribute__((aligned(4)));
int is_tree __attribute__((aligned(4)));
/* presumably offsets into the squared-sum image - TODO confirm */
int pq0 __attribute__((aligned(4)));
int pq1 __attribute__((aligned(4)));
int pq2 __attribute__((aligned(4)));
int pq3 __attribute__((aligned(4)));
/* presumably offsets into the sum image - TODO confirm */
int p0 __attribute__((aligned(4)));
int p1 __attribute__((aligned(4)));
int p2 __attribute__((aligned(4)));
int p3 __attribute__((aligned(4)));
float inv_window_area __attribute__((aligned(4)));
} GpuHidHaarClassifierCascade;
/* Runs the Haar cascade over every scale listed in `info` and appends the
 * windows that pass all stages to `candidate`.
 *
 * Arguments (as used by the body below):
 *   stagecascadeptr  per-stage count and threshold
 *   info             one int4 per scale: .x = (width << 16) | height,
 *                    .y = (groups per line << 16) | total groups,
 *                    .w = bit pattern of the float scale factor
 *   nodeptr          nodes; the nodes for scale s start at
 *                    startnode + nodecount * s
 *   sum, sqsum       images indexed as [y * step + x]; every index is clamped
 *                    to [0, rows * cols - 1]
 *   candidate        output; each work-group owns the 256 entries starting at
 *                    get_group_id(0) * 256
 *   rows, cols       only used to compute the clamp limit
 *   step             row stride of sum / sqsum, in elements
 *   loopcount        number of scales
 *   start_stage, end_stage   half-open range of stages to evaluate
 *   split_stage      not referenced in the body
 *   p                one int4 per scale with the window corners used for the
 *                    mean / variance computation
 *   correction       per-scale multiplier applied to the window sums
 *
 * The work-group loops over its share of tiles with barriers inside the loop;
 * the trip count depends only on the group id, so all work-items of a group
 * reach the same barriers.
 */
__kernel void gpuRunHaarClassifierCascade_scaled2(
global GpuHidHaarStageClassifier *stagecascadeptr,
global int4 *info,
global GpuHidHaarTreeNode *nodeptr,
global const int *restrict sum,
global const float *restrict sqsum,
global int4 *candidate,
const int rows,
const int cols,
const int step,
const int loopcount,
const int start_stage,
const int split_stage,
const int end_stage,
const int startnode,
global int4 *p,
global float *correction,
const int nodecount)
{
int grpszx = get_local_size(0);
int grpszy = get_local_size(1);
int grpnumx = get_num_groups(0);
int grpidx = get_group_id(0);
int lclidx = get_local_id(0);
int lclidy = get_local_id(1);
// lcl_sz is computed but not used further down.
int lcl_sz = mul24(grpszx, grpszy);
// Linear index of this work-item inside its work-group.
int lcl_id = mad24(lclidy, grpszx, lclidx);
// glboutindex: running count of candidates this group has written.
// lclcount / lcloutindex: per-tile queue of packed (y << 16) | x hits.
__local int glboutindex[1];
__local int lclcount[1];
// NOTE(review): lcloutindex is indexed by lcl_id below, so this assumes the
// work-group has at most 64 work-items - confirm against the host launch code.
__local int lcloutindex[64];
glboutindex[0] = 0;
// Each work-item clears four consecutive entries of the group's output slice.
int outputoff = mul24(grpidx, 256);
candidate[outputoff + (lcl_id << 2)] = (int4)0;
candidate[outputoff + (lcl_id << 2) + 1] = (int4)0;
candidate[outputoff + (lcl_id << 2) + 2] = (int4)0;
candidate[outputoff + (lcl_id << 2) + 3] = (int4)0;
// Largest valid linear index; every sum / sqsum read is clamped to it.
int max_idx = rows * cols - 1;
for (int scalei = 0; scalei < loopcount; scalei++)
{
// Unpack the per-scale parameters.
int4 scaleinfo1;
scaleinfo1 = info[scalei];
int width = (scaleinfo1.x & 0xffff0000) >> 16;
int height = scaleinfo1.x & 0xffff;
int grpnumperline = (scaleinfo1.y & 0xffff0000) >> 16;
int totalgrp = scaleinfo1.y & 0xffff;
float factor = as_float(scaleinfo1.w);
float correction_t = correction[scalei];
// Pixel step between neighbouring windows: the factor rounded to nearest,
// but never less than 2.
int ystep = (int)(max(2.0f, factor) + 0.5f);
// Tiles are distributed round-robin over the launched work-groups.
for (int grploop = get_group_id(0); grploop < totalgrp; grploop += grpnumx)
{
int4 cascadeinfo = p[scalei];
// Tile coordinates; this grpidx shadows the outer group id on purpose.
int grpidy = grploop / grpnumperline;
int grpidx = grploop - mul24(grpidy, grpnumperline);
int ix = mad24(grpidx, grpszx, lclidx);
int iy = mad24(grpidy, grpszy, lclidy);
int x = ix * ystep;
int y = iy * ystep;
lcloutindex[lcl_id] = 0;
lclcount[0] = 0;
int nodecounter;
float mean, variance_norm_factor;
// The bounds test is deliberately deferred to the queueing step below so
// that every work-item reaches the barriers.
//if((ix < width) && (iy < height))
{
const int p_offset = mad24(y, step, x);
cascadeinfo.x += p_offset;
cascadeinfo.z += p_offset;
// Window mean and variance normalisation from the four corner samples
// of sum and sqsum.
mean = (sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)]
- sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
+ sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)])
* correction_t;
variance_norm_factor = sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)]
- sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
+ sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)];
variance_norm_factor = variance_norm_factor * correction_t - mean * mean;
// A negative variance falls back to a factor of 1.
variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f;
bool result = true;
// First node belonging to this scale.
nodecounter = startnode + nodecount * scalei;
// Evaluate stages until one rejects the window.
for (int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++)
{
float stage_sum = 0.f;
int stagecount = stagecascadeptr[stageloop].count;
// nodeloop is advanced inside the body: a tree classifier may visit
// more than one node before it counts as finished.
for (int nodeloop = 0; nodeloop < stagecount;)
{
__global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter);
// Vector loads of the three rectangles, the weights (+ threshold
// in lane w) and the alpha values.
int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4 *)(&(currentnodeptr->weight[0]));
float3 alpha3 = *(__global float3 *)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
// Shift the rectangles' x coordinates to this window.
info1.x += p_offset;
info1.z += p_offset;
info2.x += p_offset;
info2.z += p_offset;
info3.x += p_offset;
info3.z += p_offset;
// Weighted sum of the three rectangle integrals.
float classsum = (sum[clamp(mad24(info1.y, step, info1.x), 0, max_idx)]
- sum[clamp(mad24(info1.y, step, info1.z), 0, max_idx)] -
sum[clamp(mad24(info1.w, step, info1.x), 0, max_idx)]
+ sum[clamp(mad24(info1.w, step, info1.z), 0, max_idx)]) * w.x;
classsum += (sum[clamp(mad24(info2.y, step, info2.x), 0, max_idx)]
- sum[clamp(mad24(info2.y, step, info2.z), 0, max_idx)] -
sum[clamp(mad24(info2.w, step, info2.x), 0, max_idx)]
+ sum[clamp(mad24(info2.w, step, info2.z), 0, max_idx)]) * w.y;
classsum += (sum[clamp(mad24(info3.y, step, info3.x), 0, max_idx)]
- sum[clamp(mad24(info3.y, step, info3.z), 0, max_idx)] -
sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)]
+ sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z;
bool passThres = classsum >= nodethreshold;
#if STUMP_BASED
// Stump: one node per classifier, alpha.y on pass, alpha.x otherwise.
stage_sum += passThres ? alpha3.y : alpha3.x;
nodecounter++;
nodeloop++;
#else
// Tree: nodes with an even index are treated as roots.
bool isRootNode = (nodecounter & 1) == 0;
if(isRootNode)
{
// Descend to the following node when the selected child link is
// non-zero; otherwise finish the classifier here and skip it.
if( (passThres && currentnodeptr->right) ||
(!passThres && currentnodeptr->left))
{
nodecounter ++;
}
else
{
stage_sum += alpha3.x;
nodecounter += 2;
nodeloop ++;
}
}
else
{
stage_sum += (passThres ? alpha3.z : alpha3.y);
nodecounter ++;
nodeloop ++;
}
#endif
}
result = (int)(stage_sum >= stagecascadeptr[stageloop].threshold);
}
// Makes the lclcount / lcloutindex resets above visible before queueing.
barrier(CLK_LOCAL_MEM_FENCE);
// Only in-range windows that passed every stage are queued.
if (result && (ix < width) && (iy < height))
{
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex] = (y << 16) | x;
}
barrier(CLK_LOCAL_MEM_FENCE);
int queuecount = lclcount[0];
// The first queuecount work-items each flush one queued hit.
if (lcl_id < queuecount)
{
int temp = lcloutindex[lcl_id];
int x = temp & 0xffff;
int y = (temp & (int)0xffff0000) >> 16;
temp = atomic_inc(glboutindex);
int4 candidate_result;
// Both z and w receive factor * 20 rounded towards -infinity.
candidate_result.zw = (int2)convert_int_rtn(factor * 20.f);
candidate_result.x = x;
candidate_result.y = y;
// NOTE(review): temp is already unique per writer; adding lcl_id on top
// looks like it can make two work-items pick the same slot (or step
// outside the group's 256 entries) - confirm whether this is intended.
candidate[outputoff + temp + lcl_id] = candidate_result;
}
// Keeps the next tile's queue reset from racing with the reads above.
barrier(CLK_LOCAL_MEM_FENCE);
}
}
}
}
/* Produces a scaled copy of one cascade node per work-item.
 *
 * Work-item g reads orinode[g] and writes newnode[g + nodenum]. Each of the
 * three rectangles is scaled by `scale` (rounded by adding 0.5f and truncating)
 * and stored as (x, y, x + w, y + h). weight[0] is recomputed from the scaled
 * areas of the other rectangles, then all weights are multiplied by
 * weight_scale. threshold, alpha and the child links are copied unchanged.
 */
__kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuHidHaarTreeNode *newnode, float scale, float weight_scale, int nodenum)
{
    const int src_idx = get_global_id(0);
    const int dst_idx = src_idx + nodenum;
    int rx[3], ry[3], rh[3], rw[3];
    int k;

    /* Private copy of the source node; its weight[0] is patched below. */
    GpuHidHaarTreeNode node = orinode[src_idx];

#pragma unroll
    for (k = 0; k < 3; k++)
    {
        rx[k] = (int)(node.p[k][0] * scale + 0.5f);
        ry[k] = (int)(node.p[k][1] * scale + 0.5f);
        rw[k] = (int)(node.p[k][2] * scale + 0.5f);
        rh[k] = (int)(node.p[k][3] * scale + 0.5f);
    }

    /* A non-zero p[2][0] selects the three-rectangle formula. */
    node.weight[0] = node.p[2][0] ? -(node.weight[1] * rh[1] * rw[1] + node.weight[2] * rh[2] * rw[2]) / (rh[0] * rw[0]) : -node.weight[1] * rh[1] * rw[1] / (rh[0] * rw[0]);

    global GpuHidHaarTreeNode *dst = newnode + dst_idx;

#pragma unroll
    for (k = 0; k < 3; k++)
    {
        dst->p[k][0] = rx[k];
        dst->p[k][1] = ry[k];
        dst->p[k][2] = rx[k] + rw[k];
        dst->p[k][3] = ry[k] + rh[k];
        dst->weight[k] = node.weight[k] * weight_scale;
        dst->alpha[k] = node.alpha[k];
    }

    dst->left = node.left;
    dst->right = node.right;
    dst->threshold = node.threshold;
}

@ -49,6 +49,7 @@
#include "opencv2/ml.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/core/ocl.hpp"
#include "opencv2/opencv_modules.hpp"
#ifdef HAVE_OPENCV_HIGHGUI

@ -47,6 +47,7 @@
#define __OPENCV_OCL_PRIVATE_UTIL__
#include "opencv2/ocl/cl_runtime/cl_runtime.hpp"
#include "opencv2/core/ocl_genbase.hpp"
#include "opencv2/ocl.hpp"
@ -55,13 +56,6 @@ namespace cv
namespace ocl
{
struct ProgramEntry
{
const char* name;
const char* programStr;
const char* programHash;
};
inline cl_device_id getClDeviceID(const Context *ctx)
{
return *(cl_device_id*)(ctx->getOpenCLDeviceIDPtr());

@ -64,6 +64,8 @@ using namespace cv::ocl;
using namespace cv::superres;
using namespace cv::superres::detail;
static ProgramEntry superres_btvl1 = cv::ocl::superres::superres_btvl1;
namespace cv
{
namespace ocl

@ -56,6 +56,7 @@
#include "opencv2/core/private.hpp"
#include "opencv2/core/private.cuda.hpp"
#include "opencv2/core/ocl.hpp"
#ifdef HAVE_OPENCV_CUDAARITHM
# include "opencv2/cudaarithm.hpp"

@ -0,0 +1,276 @@
#include "opencv2/objdetect.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/core/ocl.hpp"
#include <cctype>
#include <iostream>
#include <iterator>
#include <stdio.h>
using namespace std;
using namespace cv;
// Prints the usage banner for this sample to stdout.
//
// The --scale description matches what main() and detectAndDraw() actually do:
// the value is used directly as the resize factor applied before detection,
// and anything above 1 is reset to 1.
static void help()
{
    cout << "\nThis program demonstrates the cascade recognizer. Now you can use Haar or LBP features.\n"
            "This classifier can recognize many kinds of rigid objects, once the appropriate classifier is trained.\n"
            "It's most known use is for faces.\n"
            "Usage:\n"
            "./facedetect [--cascade=<cascade_path> this is the primary trained classifier such as frontal face]\n"
            " [--nested-cascade[=nested_cascade_path this an optional secondary classifier such as eyes]]\n"
            " [--scale=<image resize factor in the range (0, 1], try 0.75 for example>]\n"
            " [--try-flip]\n"
            " [filename|camera_index]\n\n"
            "see facedetect.cmd for one call:\n"
            "./facedetect --cascade=\"../../data/haarcascades/haarcascade_frontalface_alt.xml\" --nested-cascade=\"../../data/haarcascades/haarcascade_eye.xml\" --scale=0.75\n\n"
            "During execution:\n\tHit any key to quit.\n"
            "\tUsing OpenCV version " << CV_VERSION << "\n" << endl;
}
void detectAndDraw( UMat& img, Mat& canvas, CascadeClassifier& cascade,
CascadeClassifier& nestedCascade,
double scale, bool tryflip );
string cascadeName = "../../data/haarcascades/haarcascade_frontalface_alt.xml";
string nestedCascadeName = "../../data/haarcascades/haarcascade_eye_tree_eyeglasses.xml";
// Entry point of the sample.
//
// Parses the command line, loads the cascade(s), then runs detection on one of:
//   * a camera (no input given, or a single digit),
//   * a single image file,
//   * a video file (when the input cannot be read as an image),
//   * a text file listing one image path per line (last resort).
// Returns 0 on normal exit and -1 when the primary cascade cannot be loaded.
int main( int argc, const char** argv )
{
    VideoCapture capture;
    UMat frame, image;
    Mat canvas;
    const string scaleOpt = "--scale=";
    size_t scaleOptLen = scaleOpt.length();
    const string cascadeOpt = "--cascade=";
    size_t cascadeOptLen = cascadeOpt.length();
    const string nestedCascadeOpt = "--nested-cascade";
    size_t nestedCascadeOptLen = nestedCascadeOpt.length();
    const string tryFlipOpt = "--try-flip";
    size_t tryFlipOptLen = tryFlipOpt.length();
    String inputName;
    bool tryflip = false;

    help();

    CascadeClassifier cascade, nestedCascade;
    double scale = 1;

    for( int i = 1; i < argc; i++ )
    {
        cout << "Processing " << i << " " << argv[i] << endl;
        if( cascadeOpt.compare( 0, cascadeOptLen, argv[i], cascadeOptLen ) == 0 )
        {
            cascadeName.assign( argv[i] + cascadeOptLen );
            cout << " from which we have cascadeName= " << cascadeName << endl;
        }
        else if( nestedCascadeOpt.compare( 0, nestedCascadeOptLen, argv[i], nestedCascadeOptLen ) == 0 )
        {
            if( argv[i][nestedCascadeOpt.length()] == '=' )
                nestedCascadeName.assign( argv[i] + nestedCascadeOpt.length() + 1 );
            if( !nestedCascade.load( nestedCascadeName ) )
                cerr << "WARNING: Could not load classifier cascade for nested objects" << endl;
        }
        else if( scaleOpt.compare( 0, scaleOptLen, argv[i], scaleOptLen ) == 0 )
        {
            // The scale is handed to resize() as fx/fy, so it has to be a
            // successfully parsed value in (0, 1]. sscanf() may return EOF
            // (non-zero) on empty input, hence the explicit "!= 1"; the
            // negated range test also rejects NaN.
            if( sscanf( argv[i] + scaleOpt.length(), "%lf", &scale ) != 1 ||
                !(scale > 0 && scale <= 1) )
                scale = 1;
            cout << " from which we read scale = " << scale << endl;
        }
        else if( tryFlipOpt.compare( 0, tryFlipOptLen, argv[i], tryFlipOptLen ) == 0 )
        {
            tryflip = true;
            cout << " will try to flip image horizontally to detect assymetric objects\n";
        }
        else if( argv[i][0] == '-' )
        {
            cerr << "WARNING: Unknown option " << argv[i] << endl;
        }
        else
            inputName = argv[i];
    }

    if( !cascade.load( cascadeName ) )
    {
        cerr << "ERROR: Could not load classifier cascade" << endl;
        help();
        return -1;
    }

    // <cctype> classifiers require a value representable as unsigned char.
    if( inputName.empty() ||
        (isdigit((unsigned char)inputName.c_str()[0]) && inputName.c_str()[1] == '\0') )
    {
        // No input or a single digit: treat it as a camera index.
        int c = inputName.empty() ? 0 : inputName.c_str()[0] - '0';
        if(!capture.open(c))
            cout << "Capture from camera #" << c << " didn't work" << endl;
    }
    else
    {
        // Try the input as an image first, then as a video.
        image = imread( inputName, 1 ).getUMat(ACCESS_READ);
        if( image.empty() )
        {
            if(!capture.open( inputName ))
                cout << "Could not read " << inputName << endl;
        }
    }

    namedWindow( "result", 1 );

    if( capture.isOpened() )
    {
        cout << "Video capturing has been started ..." << endl;
        for(;;)
        {
            capture >> frame;
            if( frame.empty() )
                break;

            detectAndDraw( frame, canvas, cascade, nestedCascade, scale, tryflip );

            if( waitKey( 10 ) >= 0 )
                break;
        }
    }
    else
    {
        cout << "Detecting face(s) in " << inputName << endl;
        if( !image.empty() )
        {
            detectAndDraw( image, canvas, cascade, nestedCascade, scale, tryflip );
            waitKey(0);
        }
        else if( !inputName.empty() )
        {
            /* assume it is a text file containing the
            list of the image filenames to be processed - one per line */
            FILE* f = fopen( inputName.c_str(), "rt" );
            if( f )
            {
                char buf[1000+1];
                while( fgets( buf, (int)sizeof(buf), f ) )
                {
                    int len = (int)strlen(buf), c;
                    // Strip the trailing newline and any other whitespace.
                    while( len > 0 && isspace((unsigned char)buf[len-1]) )
                        len--;
                    buf[len] = '\0';
                    cout << "file " << buf << endl;
                    image = imread( buf, 1 ).getUMat(ACCESS_READ);
                    if( !image.empty() )
                    {
                        detectAndDraw( image, canvas, cascade, nestedCascade, scale, tryflip );
                        c = waitKey(0);
                        // ESC or q/Q stops processing the list.
                        if( c == 27 || c == 'q' || c == 'Q' )
                            break;
                    }
                    else
                    {
                        cerr << "Aw snap, couldn't read image " << buf << endl;
                    }
                }
                fclose(f);
            }
        }
    }

    return 0;
}
// Detects objects in `img` and shows the annotated result in the "result" window.
//
//   img            input BGR frame
//   canvas         receives the BGR rendering of the resized, equalized
//                  grayscale image with detections and an fps label on top
//   cascade        primary classifier
//   nestedCascade  optional classifier run inside every primary detection;
//                  skipped when empty
//   scale0         resize factor applied to the grayscale image before detection
//   tryflip        also run the primary classifier on a mirrored copy
//
// The mirrored pass writes into its own buffer, so smallImg keeps the original
// orientation. Detections are stored in unmirrored coordinates, and both the
// canvas and the nested-cascade ROIs are taken from smallImg, so they have to
// agree with those coordinates.
void detectAndDraw( UMat& img, Mat& canvas, CascadeClassifier& cascade,
                    CascadeClassifier& nestedCascade,
                    double scale0, bool tryflip )
{
    int i = 0;
    // Detections are drawn on the resized image itself, hence a draw scale of 1.
    double t = 0, scale=1;
    vector<Rect> faces, faces2;
    const static Scalar colors[] =
    {
        Scalar(0,0,255),
        Scalar(0,128,255),
        Scalar(0,255,255),
        Scalar(0,255,0),
        Scalar(255,128,0),
        Scalar(255,255,0),
        Scalar(255,0,0),
        Scalar(255,0,255)
    };
    // Kept static so the buffers are reused from frame to frame.
    static UMat gray, smallImg, flippedImg;

    t = (double)getTickCount();

    cvtColor( img, gray, COLOR_BGR2GRAY );
    resize( gray, smallImg, Size(), scale0, scale0, INTER_LINEAR );
    equalizeHist( smallImg, smallImg );

    cascade.detectMultiScale( smallImg, faces,
        1.1, 2, 0
        //|CASCADE_FIND_BIGGEST_OBJECT
        //|CASCADE_DO_ROUGH_SEARCH
        |CASCADE_SCALE_IMAGE
        ,
        Size(30, 30) );
    if( tryflip )
    {
        flip(smallImg, flippedImg, 1);
        cascade.detectMultiScale( flippedImg, faces2,
                                 1.1, 2, 0
                                 //|CASCADE_FIND_BIGGEST_OBJECT
                                 //|CASCADE_DO_ROUGH_SEARCH
                                 |CASCADE_SCALE_IMAGE
                                 ,
                                 Size(30, 30) );
        // Mirror the rectangles back into smallImg coordinates.
        for( vector<Rect>::const_iterator r = faces2.begin(); r != faces2.end(); r++ )
        {
            faces.push_back(Rect(flippedImg.cols - r->x - r->width, r->y, r->width, r->height));
        }
    }
    t = (double)getTickCount() - t;

    cvtColor(smallImg, canvas, COLOR_GRAY2BGR);

    double fps = getTickFrequency()/t;

    putText(canvas, format("OpenCL: %s, fps: %.1f", ocl::useOpenCL() ? "ON" : "OFF", fps), Point(250, 50),
            FONT_HERSHEY_SIMPLEX, 1, Scalar(0,255,0), 3);

    for( vector<Rect>::const_iterator r = faces.begin(); r != faces.end(); r++, i++ )
    {
        vector<Rect> nestedObjects;
        Point center;
        Scalar color = colors[i%8];
        int radius;

        // Roughly square detections are drawn as circles, others as rectangles.
        double aspect_ratio = (double)r->width/r->height;
        if( 0.75 < aspect_ratio && aspect_ratio < 1.3 )
        {
            center.x = cvRound((r->x + r->width*0.5)*scale);
            center.y = cvRound((r->y + r->height*0.5)*scale);
            radius = cvRound((r->width + r->height)*0.25*scale);
            circle( canvas, center, radius, color, 3, 8, 0 );
        }
        else
            rectangle( canvas, Point(cvRound(r->x*scale), cvRound(r->y*scale)),
                       Point(cvRound((r->x + r->width-1)*scale), cvRound((r->y + r->height-1)*scale)),
                       color, 3, 8, 0);
        if( nestedCascade.empty() )
            continue;
        UMat smallImgROI = smallImg(*r);
        nestedCascade.detectMultiScale( smallImgROI, nestedObjects,
            1.1, 2, 0
            //|CASCADE_FIND_BIGGEST_OBJECT
            //|CASCADE_DO_ROUGH_SEARCH
            //|CASCADE_DO_CANNY_PRUNING
            |CASCADE_SCALE_IMAGE
            ,
            Size(30, 30) );
        // Nested detections are relative to the ROI; offset them by the face origin.
        for( vector<Rect>::const_iterator nr = nestedObjects.begin(); nr != nestedObjects.end(); nr++ )
        {
            center.x = cvRound((r->x + nr->x + nr->width*0.5)*scale);
            center.y = cvRound((r->y + nr->y + nr->height*0.5)*scale);
            radius = cvRound((nr->width + nr->height)*0.25*scale);
            circle( canvas, center, radius, color, 3, 8, 0 );
        }
    }
    imshow( "result", canvas );
}

@ -11,7 +11,7 @@
using namespace std;
using namespace cv;
#define LOOP_NUM 10
#define LOOP_NUM 1
const static Scalar colors[] = { CV_RGB(0,0,255),
CV_RGB(0,128,255),
@ -83,7 +83,7 @@ int main( int argc, const char** argv )
}
CvCapture* capture = 0;
Mat frame, frameCopy, image;
Mat frame, frameCopy0, frameCopy, image;
bool useCPU = cmd.get<bool>("s");
string inputName = cmd.get<string>("i");
@ -129,16 +129,21 @@ int main( int argc, const char** argv )
if( frame.empty() )
break;
if( iplImg->origin == IPL_ORIGIN_TL )
frame.copyTo( frameCopy );
frame.copyTo( frameCopy0 );
else
flip( frame, frameCopy, 0 );
flip( frame, frameCopy0, 0 );
if( scale == 1)
frameCopy0.copyTo(frameCopy);
else
resize(frameCopy0, frameCopy, Size(), 1./scale, 1./scale, INTER_LINEAR);
work_end = 0;
if(useCPU)
detectCPU(frameCopy, faces, cpu_cascade, scale, false);
detectCPU(frameCopy, faces, cpu_cascade, 1, false);
else
detect(frameCopy, faces, cascade, scale, false);
detect(frameCopy, faces, cascade, 1, false);
Draw(frameCopy, faces, scale);
Draw(frameCopy, faces, 1);
if( waitKey( 10 ) >= 0 )
break;
}
@ -150,6 +155,7 @@ int main( int argc, const char** argv )
vector<Rect> faces;
vector<Rect> ref_rst;
double accuracy = 0.;
work_end = 0;
for(int i = 0; i <= LOOP_NUM; i ++)
{
cout << "loop" << i << endl;
@ -188,7 +194,7 @@ void detect( Mat& img, vector<Rect>& faces,
{
ocl::oclMat image(img);
ocl::oclMat gray, smallImg( cvRound (img.rows/scale), cvRound(img.cols/scale), CV_8UC1 );
if(calTime) workBegin();
workBegin();
ocl::cvtColor( image, gray, COLOR_BGR2GRAY );
ocl::resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
ocl::equalizeHist( smallImg, smallImg );
@ -197,14 +203,14 @@ void detect( Mat& img, vector<Rect>& faces,
3, 0
|CASCADE_SCALE_IMAGE
, Size(30,30), Size(0, 0) );
if(calTime) workEnd();
workEnd();
}
void detectCPU( Mat& img, vector<Rect>& faces,
CascadeClassifier& cascade,
double scale, bool calTime)
{
if(calTime) workBegin();
workBegin();
Mat cpu_gray, cpu_smallImg( cvRound (img.rows/scale), cvRound(img.cols/scale), CV_8UC1 );
cvtColor(img, cpu_gray, COLOR_BGR2GRAY);
resize(cpu_gray, cpu_smallImg, cpu_smallImg.size(), 0, 0, INTER_LINEAR);
@ -212,13 +218,15 @@ void detectCPU( Mat& img, vector<Rect>& faces,
cascade.detectMultiScale(cpu_smallImg, faces, 1.1,
3, 0 | CASCADE_SCALE_IMAGE,
Size(30, 30), Size(0, 0));
if(calTime) workEnd();
workEnd();
}
void Draw(Mat& img, vector<Rect>& faces, double scale)
{
int i = 0;
putText(img, format("fps: %.1f", 1000./getTime()), Point(450, 50),
FONT_HERSHEY_SIMPLEX, 1, Scalar(0,255,0), 3);
for( vector<Rect>::const_iterator r = faces.begin(); r != faces.end(); r++, i++ )
{
Point center;
@ -229,7 +237,7 @@ void Draw(Mat& img, vector<Rect>& faces, double scale)
radius = cvRound((r->width + r->height)*0.25*scale);
circle( img, center, radius, color, 3, 8, 0 );
}
imwrite( outputName, img );
//imwrite( outputName, img );
if(abs(scale-1.0)>.001)
{
resize(img, img, Size((int)(img.cols/scale), (int)(img.rows/scale)));

Loading…
Cancel
Save