diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake index 6f2258d97b..ffc998a0f8 100644 --- a/cmake/OpenCVModule.cmake +++ b/cmake/OpenCVModule.cmake @@ -501,9 +501,10 @@ macro(ocv_glob_module_sources) file(GLOB cl_kernels "src/opencl/*.cl") if(HAVE_opencv_ocl AND cl_kernels) ocv_include_directories(${OPENCL_INCLUDE_DIRS}) + string(REGEX REPLACE "opencv_" "" the_module_barename "${the_module}") add_custom_command( OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp" - COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake" + COMMAND ${CMAKE_COMMAND} -DMODULE_NAME="${the_module_barename}" -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake" DEPENDS ${cl_kernels} "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake") source_group("OpenCL" FILES ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp") list(APPEND lib_srcs ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp") diff --git a/cmake/cl2cpp.cmake b/cmake/cl2cpp.cmake index 1916c3ee5b..17a9e09983 100644 --- a/cmake/cl2cpp.cmake +++ b/cmake/cl2cpp.cmake @@ -4,6 +4,15 @@ list(SORT cl_list) string(REPLACE ".cpp" ".hpp" OUTPUT_HPP "${OUTPUT}") get_filename_component(OUTPUT_HPP_NAME "${OUTPUT_HPP}" NAME) +if("${MODULE_NAME}" STREQUAL "ocl") + set(nested_namespace_start "") + set(nested_namespace_end "") +else() + set(new_mode ON) + set(nested_namespace_start "namespace ${MODULE_NAME}\n{") + set(nested_namespace_end "}") +endif() + set(STR_CPP "// This file is auto-generated. Do not edit! 
#include \"precomp.hpp\" @@ -13,16 +22,19 @@ namespace cv { namespace ocl { +${nested_namespace_start} + ") set(STR_HPP "// This file is auto-generated. Do not edit! -#include \"opencv2/ocl/private/util.hpp\" +#include \"opencv2/core/ocl_genbase.hpp\" namespace cv { namespace ocl { +${nested_namespace_start} ") @@ -49,12 +61,19 @@ foreach(cl ${cl_list}) string(MD5 hash "${lines}") - set(STR_CPP "${STR_CPP}const struct ProgramEntry ${cl_filename}={\"${cl_filename}\",\n\"${lines}, \"${hash}\"};\n") - set(STR_HPP "${STR_HPP}extern const struct ProgramEntry ${cl_filename};\n") + set(STR_CPP_DECL "const struct ProgramEntry ${cl_filename}={\"${cl_filename}\",\n\"${lines}, \"${hash}\"};\n") + set(STR_HPP_DECL "extern const struct ProgramEntry ${cl_filename};\n") + if(new_mode) + set(STR_CPP_DECL "${STR_CPP_DECL}ProgramSource2 ${cl_filename}_oclsrc(${cl_filename}.programStr);\n") + set(STR_HPP_DECL "${STR_HPP_DECL}extern ProgramSource2 ${cl_filename}_oclsrc;\n") + endif() + + set(STR_CPP "${STR_CPP}${STR_CPP_DECL}") + set(STR_HPP "${STR_HPP}${STR_HPP_DECL}") endforeach() -set(STR_CPP "${STR_CPP}}\n}\n") -set(STR_HPP "${STR_HPP}}\n}\n") +set(STR_CPP "${STR_CPP}}\n${nested_namespace_end}}\n") +set(STR_HPP "${STR_HPP}}\n${nested_namespace_end}}\n") file(WRITE "${OUTPUT}" "${STR_CPP}") diff --git a/modules/bioinspired/src/precomp.hpp b/modules/bioinspired/src/precomp.hpp index 541b970325..61aeb5409c 100644 --- a/modules/bioinspired/src/precomp.hpp +++ b/modules/bioinspired/src/precomp.hpp @@ -47,6 +47,7 @@ #include "opencv2/bioinspired.hpp" #include "opencv2/core/utility.hpp" #include "opencv2/core/private.hpp" +#include "opencv2/core/ocl.hpp" #include diff --git a/modules/bioinspired/src/retina_ocl.cpp b/modules/bioinspired/src/retina_ocl.cpp index a365ab0971..5d2b4bd15e 100644 --- a/modules/bioinspired/src/retina_ocl.cpp +++ b/modules/bioinspired/src/retina_ocl.cpp @@ -56,6 +56,8 @@ namespace cv { +static ocl::ProgramEntry retina_kernel = ocl::bioinspired::retina_kernel; + 
namespace bioinspired { namespace ocl diff --git a/modules/core/include/opencv2/core.hpp b/modules/core/include/opencv2/core.hpp index 1fce576d6c..87263fa7d0 100644 --- a/modules/core/include/opencv2/core.hpp +++ b/modules/core/include/opencv2/core.hpp @@ -347,6 +347,10 @@ CV_EXPORTS_W void max(InputArray src1, InputArray src2, OutputArray dst); CV_EXPORTS void min(const Mat& src1, const Mat& src2, Mat& dst); //! computes per-element maximum of two arrays (dst = max(src1, src2)) CV_EXPORTS void max(const Mat& src1, const Mat& src2, Mat& dst); +//! computes per-element minimum of two arrays (dst = min(src1, src2)) +CV_EXPORTS void min(const UMat& src1, const UMat& src2, UMat& dst); +//! computes per-element maximum of two arrays (dst = max(src1, src2)) +CV_EXPORTS void max(const UMat& src1, const UMat& src2, UMat& dst); //! computes square root of each matrix element (dst = src**0.5) CV_EXPORTS_W void sqrt(InputArray src, OutputArray dst); diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp index 4df2432aeb..2f38f8bbb8 100644 --- a/modules/core/include/opencv2/core/mat.hpp +++ b/modules/core/include/opencv2/core/mat.hpp @@ -58,6 +58,8 @@ namespace cv enum { ACCESS_READ=1<<24, ACCESS_WRITE=1<<25, ACCESS_RW=3<<24, ACCESS_MASK=ACCESS_RW, ACCESS_FAST=1<<26 }; +class CV_EXPORTS _OutputArray; + //////////////////////// Input/Output Array Arguments ///////////////////////////////// /*! 
@@ -116,12 +118,22 @@ public: void* getObj() const; virtual int kind() const; + virtual int dims(int i=-1) const; virtual Size size(int i=-1) const; + virtual int sizend(int* sz, int i=-1) const; + virtual bool sameSize(const _InputArray& arr) const; virtual size_t total(int i=-1) const; virtual int type(int i=-1) const; virtual int depth(int i=-1) const; virtual int channels(int i=-1) const; + virtual bool isContinuous(int i=-1) const; virtual bool empty() const; + virtual void copyTo(const _OutputArray& arr) const; + bool isMat() const; + bool isUMat() const; + bool isMatVectot() const; + bool isUMatVector() const; + bool isMatx(); virtual ~_InputArray(); @@ -197,8 +209,10 @@ public: virtual void create(Size sz, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const; virtual void create(int rows, int cols, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const; virtual void create(int dims, const int* size, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const; + virtual void createSameSize(const _InputArray& arr, int mtype) const; virtual void release() const; virtual void clear() const; + virtual void setTo(const _InputArray& value) const; }; diff --git a/modules/core/include/opencv2/core/mat.inl.hpp b/modules/core/include/opencv2/core/mat.inl.hpp index 3c49984e7f..5e8e6ee600 100644 --- a/modules/core/include/opencv2/core/mat.inl.hpp +++ b/modules/core/include/opencv2/core/mat.inl.hpp @@ -108,6 +108,12 @@ inline _InputArray::_InputArray(const cuda::CudaMem& cuda_mem) inline _InputArray::~_InputArray() {} +inline bool _InputArray::isMat() const { return kind() == _InputArray::MAT; } +inline bool _InputArray::isUMat() const { return kind() == _InputArray::UMAT; } +inline bool _InputArray::isMatVectot() const { return kind() == _InputArray::STD_VECTOR_MAT; } +inline bool _InputArray::isUMatVector() const { return kind() == _InputArray::STD_VECTOR_UMAT; } +inline bool _InputArray::isMatx() { return 
kind() == _InputArray::MATX; } + //////////////////////////////////////////////////////////////////////////////////////// inline _OutputArray::_OutputArray() { init(ACCESS_WRITE, 0); } diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp index 419ccffd5b..9a30962061 100644 --- a/modules/core/include/opencv2/core/ocl.hpp +++ b/modules/core/include/opencv2/core/ocl.hpp @@ -49,13 +49,13 @@ namespace cv { namespace ocl { CV_EXPORTS bool haveOpenCL(); CV_EXPORTS bool useOpenCL(); CV_EXPORTS void setUseOpenCL(bool flag); -CV_EXPORTS void finish(); +CV_EXPORTS void finish2(); -class CV_EXPORTS Context; +class CV_EXPORTS Context2; class CV_EXPORTS Device; class CV_EXPORTS Kernel; class CV_EXPORTS Program; -class CV_EXPORTS ProgramSource; +class CV_EXPORTS ProgramSource2; class CV_EXPORTS Queue; class CV_EXPORTS Device @@ -199,22 +199,22 @@ protected: }; -class CV_EXPORTS Context +class CV_EXPORTS Context2 { public: - Context(); - explicit Context(int dtype); - ~Context(); - Context(const Context& c); - Context& operator = (const Context& c); + Context2(); + explicit Context2(int dtype); + ~Context2(); + Context2(const Context2& c); + Context2& operator = (const Context2& c); bool create(int dtype); size_t ndevices() const; const Device& device(size_t idx) const; - Program getProg(const ProgramSource& prog, + Program getProg(const ProgramSource2& prog, const String& buildopt, String& errmsg); - static Context& getDefault(); + static Context2& getDefault(); void* ptr() const; protected: struct Impl; @@ -226,12 +226,12 @@ class CV_EXPORTS Queue { public: Queue(); - explicit Queue(const Context& c, const Device& d=Device()); + explicit Queue(const Context2& c, const Device& d=Device()); ~Queue(); Queue(const Queue& q); Queue& operator = (const Queue& q); - bool create(const Context& c=Context(), const Device& d=Device()); + bool create(const Context2& c=Context2(), const Device& d=Device()); void finish(); void* ptr() const; static 
Queue& getDefault(); @@ -245,41 +245,55 @@ protected: class CV_EXPORTS KernelArg { public: - enum { LOCAL=1, READ_ONLY=2, WRITE_ONLY=4, READ_WRITE=6, CONSTANT=8 }; - KernelArg(int _flags, UMat* _m, void* _obj=0, size_t _sz=0); + enum { LOCAL=1, READ_ONLY=2, WRITE_ONLY=4, READ_WRITE=6, CONSTANT=8, NO_SIZE=256 }; + KernelArg(int _flags, UMat* _m, int wscale=1, const void* _obj=0, size_t _sz=0); + KernelArg(); static KernelArg Local() { return KernelArg(LOCAL, 0); } - static KernelArg ReadOnly(const UMat& m) { return KernelArg(READ_ONLY, (UMat*)&m); } - static KernelArg WriteOnly(const UMat& m) { return KernelArg(WRITE_ONLY, (UMat*)&m); } + static KernelArg ReadWrite(const UMat& m, int wscale=1) + { return KernelArg(READ_WRITE, (UMat*)&m, wscale); } + static KernelArg ReadWriteNoSize(const UMat& m, int wscale=1) + { return KernelArg(READ_WRITE+NO_SIZE, (UMat*)&m, wscale); } + static KernelArg ReadOnly(const UMat& m, int wscale=1) + { return KernelArg(READ_ONLY, (UMat*)&m, wscale); } + static KernelArg WriteOnly(const UMat& m, int wscale=1) + { return KernelArg(WRITE_ONLY, (UMat*)&m, wscale); } + static KernelArg ReadOnlyNoSize(const UMat& m, int wscale=1) + { return KernelArg(READ_ONLY+NO_SIZE, (UMat*)&m, wscale); } + static KernelArg WriteOnlyNoSize(const UMat& m, int wscale=1) + { return KernelArg(WRITE_ONLY+NO_SIZE, (UMat*)&m, wscale); } static KernelArg Constant(const Mat& m); template static KernelArg Constant(const _Tp* arr, size_t n) - { return KernelArg(CONSTANT, 0, (void*)arr, n); } + { return KernelArg(CONSTANT, 0, 1, (void*)arr, n); } int flags; UMat* m; - void* obj; + const void* obj; size_t sz; + int wscale; }; + class CV_EXPORTS Kernel { public: Kernel(); Kernel(const char* kname, const Program& prog); - Kernel(const char* kname, const ProgramSource& prog, - const String& buildopts, String& errmsg); + Kernel(const char* kname, const ProgramSource2& prog, + const String& buildopts, String* errmsg=0); ~Kernel(); Kernel(const Kernel& k); Kernel& operator = 
(const Kernel& k); + bool empty() const; bool create(const char* kname, const Program& prog); - bool create(const char* kname, const ProgramSource& prog, - const String& buildopts, String& errmsg); + bool create(const char* kname, const ProgramSource2& prog, + const String& buildopts, String* errmsg=0); - void set(int i, const void* value, size_t sz); - void set(int i, const UMat& m); - void set(int i, const KernelArg& arg); - template void set(int i, const _Tp& value) + int set(int i, const void* value, size_t sz); + int set(int i, const UMat& m); + int set(int i, const KernelArg& arg); + template int set(int i, const _Tp& value) { return set(i, &value, sizeof(value)); } template @@ -291,26 +305,27 @@ public: template Kernel& args(const _Tp0& a0, const _Tp1& a1) { - set(0, a0); set(1, a1); return *this; + int i = set(0, a0); set(i, a1); return *this; } template Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2) { - set(0, a0); set(1, a1); set(2, a2); return *this; + int i = set(0, a0); i = set(i, a1); set(i, a2); return *this; } template Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3) { - set(0, a0); set(1, a1); set(2, a2); set(3, a3); return *this; + int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3); return *this; } template Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3, const _Tp4& a4) { - set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); return *this; + int i = set(0, a0); i = set(i, a1); i = set(i, a2); + i = set(i, a3); set(i, a4); return *this; } template 4) ) + return false; + + UMat src1 = _src1.getUMat(), src2; + UMat dst = _dst.getUMat(), mask = _mask.getUMat(); + + char opts[1024]; + int kercn = haveMask || haveScalar ? cn : 1; + sprintf(opts, "-D %s%s -D %s -D dstT=%s", + (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"), oclop2str[oclop], + bitwise ? 
ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, kercn)) : + ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn))); + + ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts); + if( k.empty() ) + return false; + + int cscale = cn/kercn; + ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cscale); + ocl::KernelArg dstarg = haveMask ? ocl::KernelArg::ReadWrite(dst, cscale) : + ocl::KernelArg::WriteOnly(dst, cscale); + ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1); + + if( haveScalar ) + { + size_t esz = CV_ELEM_SIZE(srctype); + double buf[4] = {0,0,0,0}; + + if( oclop != OCL_OP_NOT ) + { + Mat src2sc = _src2.getMat(); + convertAndUnrollScalar(src2sc, srctype, (uchar*)buf, 1); + } + + ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, buf, esz); + + if( !haveMask ) + k.args(src1arg, dstarg, scalararg); + else + k.args(src1arg, maskarg, dstarg, scalararg); + } + else + { + src2 = _src2.getUMat(); + ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cscale); + + if( !haveMask ) + k.args(src1arg, src2arg, dstarg); + else + k.args(src1arg, src2arg, maskarg, dstarg); + } + + size_t globalsize[] = { src1.cols*(cn/kercn), src1.rows }; + return k.run(2, globalsize, 0, false); +} + + +static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst, + InputArray _mask, const BinaryFunc* tab, + bool bitwise, int oclop ) +{ + const _InputArray *psrc1 = &_src1, *psrc2 = &_src2; + int kind1 = psrc1->kind(), kind2 = psrc2->kind(); + int type1 = psrc1->type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1); + int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2); + int dims1 = psrc1->dims(), dims2 = psrc2->dims(); + Size sz1 = dims1 <= 2 ? psrc1->size() : Size(); + Size sz2 = dims2 <= 2 ? 
psrc2->size() : Size(); + bool use_opencl = (kind1 == _InputArray::UMAT || kind2 == _InputArray::UMAT) && + ocl::useOpenCL() && dims1 <= 2 && dims2 <= 2; bool haveMask = !_mask.empty(), haveScalar = false; BinaryFunc func; - int c; - if( src1.dims <= 2 && src2.dims <= 2 && kind1 == kind2 && - src1.size() == src2.size() && src1.type() == src2.type() && !haveMask ) + if( dims1 <= 2 && dims2 <= 2 && kind1 == kind2 && sz1 == sz2 && type1 == type2 && !haveMask ) { - _dst.create(src1.size(), src1.type()); - Mat dst = _dst.getMat(); + _dst.create(sz1, type1); + if( use_opencl && ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, false) ) + return; if( bitwise ) { func = *tab; - c = (int)src1.elemSize(); + cn = (int)CV_ELEM_SIZE(type1); } else - { - func = tab[src1.depth()]; - c = src1.channels(); - } + func = tab[depth1]; + Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(); Size sz = getContinuousSize(src1, src2, dst); - size_t len = sz.width*(size_t)c; + size_t len = sz.width*(size_t)cn; if( len == (size_t)(int)len ) { sz.width = (int)len; @@ -946,56 +1025,67 @@ static void binary_op(InputArray _src1, InputArray _src2, OutputArray _dst, } } - if( (kind1 == _InputArray::MATX) + (kind2 == _InputArray::MATX) == 1 || - src1.size != src2.size || src1.type() != src2.type() ) + if( oclop == OCL_OP_NOT ) + haveScalar = true; + else if( (kind1 == _InputArray::MATX) + (kind2 == _InputArray::MATX) == 1 || + !psrc1->sameSize(*psrc2) || type1 != type2 ) { - if( checkScalar(src1, src2.type(), kind1, kind2) ) + if( checkScalar(*psrc1, type2, kind1, kind2) ) + { // src1 is a scalar; swap it with src2 - swap(src1, src2); - else if( !checkScalar(src2, src1.type(), kind2, kind1) ) + swap(psrc1, psrc2); + swap(type1, type2); + swap(depth1, depth2); + swap(cn, cn2); + swap(sz1, sz2); + } + else if( !checkScalar(*psrc2, type1, kind2, kind1) ) CV_Error( CV_StsUnmatchedSizes, "The operation is neither 'array op array' (where arrays have the same size and type), " 
"nor 'array op scalar', nor 'scalar op array'" ); haveScalar = true; } + else + { + CV_Assert( psrc1->sameSize(*psrc2) && type1 == type2 ); + } - size_t esz = src1.elemSize(); + size_t esz = CV_ELEM_SIZE(type1); size_t blocksize0 = (BLOCK_SIZE + esz-1)/esz; - int cn = src1.channels(); BinaryFunc copymask = 0; - Mat mask; bool reallocate = false; if( haveMask ) { - mask = _mask.getMat(); - CV_Assert( (mask.type() == CV_8UC1 || mask.type() == CV_8SC1) ); - CV_Assert( mask.size == src1.size ); + int mtype = _mask.type(); + CV_Assert( (mtype == CV_8U || mtype == CV_8S) && _mask.sameSize(*psrc1)); copymask = getCopyMaskFunc(esz); - Mat tdst = _dst.getMat(); - reallocate = tdst.size != src1.size || tdst.type() != src1.type(); + reallocate = !_dst.sameSize(*psrc1) || _dst.type() != type1; } AutoBuffer _buf; uchar *scbuf = 0, *maskbuf = 0; - _dst.create(src1.dims, src1.size, src1.type()); - Mat dst = _dst.getMat(); - + _dst.createSameSize(*psrc1, type1); // if this is mask operation and dst has been reallocated, - // we have to + // we have to clear the destination if( haveMask && reallocate ) - dst = Scalar::all(0); + _dst.setTo(0.); + + if( use_opencl && ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, haveScalar )) + return; + + Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(); + Mat dst = _dst.getMat(), mask = _mask.getMat(); if( bitwise ) { func = *tab; - c = (int)esz; + cn = (int)esz; } else { - func = tab[src1.depth()]; - c = cn; + func = tab[depth1]; } if( !haveScalar ) @@ -1006,8 +1096,8 @@ static void binary_op(InputArray _src1, InputArray _src2, OutputArray _dst, NAryMatIterator it(arrays, ptrs); size_t total = it.size, blocksize = total; - if( blocksize*c > INT_MAX ) - blocksize = INT_MAX/c; + if( blocksize*cn > INT_MAX ) + blocksize = INT_MAX/cn; if( haveMask ) { @@ -1022,7 +1112,7 @@ static void binary_op(InputArray _src1, InputArray _src2, OutputArray _dst, { int bsz = (int)MIN(total - j, blocksize); - func( ptrs[0], 0, ptrs[1], 0, haveMask ? 
maskbuf : ptrs[2], 0, Size(bsz*c, 1), 0 ); + func( ptrs[0], 0, ptrs[1], 0, haveMask ? maskbuf : ptrs[2], 0, Size(bsz*cn, 1), 0 ); if( haveMask ) { copymask( maskbuf, 0, ptrs[3], 0, ptrs[2], 0, Size(bsz, 1), &esz ); @@ -1054,7 +1144,7 @@ static void binary_op(InputArray _src1, InputArray _src2, OutputArray _dst, { int bsz = (int)MIN(total - j, blocksize); - func( ptrs[0], 0, scbuf, 0, haveMask ? maskbuf : ptrs[1], 0, Size(bsz*c, 1), 0 ); + func( ptrs[0], 0, scbuf, 0, haveMask ? maskbuf : ptrs[1], 0, Size(bsz*cn, 1), 0 ); if( haveMask ) { copymask( maskbuf, 0, ptrs[2], 0, ptrs[1], 0, Size(bsz, 1), &esz ); @@ -1101,47 +1191,59 @@ static BinaryFunc* getMinTab() void cv::bitwise_and(InputArray a, InputArray b, OutputArray c, InputArray mask) { BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(and8u); - binary_op(a, b, c, mask, &f, true); + binary_op(a, b, c, mask, &f, true, OCL_OP_AND); } void cv::bitwise_or(InputArray a, InputArray b, OutputArray c, InputArray mask) { BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(or8u); - binary_op(a, b, c, mask, &f, true); + binary_op(a, b, c, mask, &f, true, OCL_OP_OR); } void cv::bitwise_xor(InputArray a, InputArray b, OutputArray c, InputArray mask) { BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(xor8u); - binary_op(a, b, c, mask, &f, true); + binary_op(a, b, c, mask, &f, true, OCL_OP_XOR); } void cv::bitwise_not(InputArray a, OutputArray c, InputArray mask) { BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(not8u); - binary_op(a, a, c, mask, &f, true); + binary_op(a, a, c, mask, &f, true, OCL_OP_NOT); } void cv::max( InputArray src1, InputArray src2, OutputArray dst ) { - binary_op(src1, src2, dst, noArray(), getMaxTab(), false ); + binary_op(src1, src2, dst, noArray(), getMaxTab(), false, OCL_OP_MAX ); } void cv::min( InputArray src1, InputArray src2, OutputArray dst ) { - binary_op(src1, src2, dst, noArray(), getMinTab(), false ); + binary_op(src1, src2, dst, noArray(), getMinTab(), false, OCL_OP_MIN ); } void cv::max(const Mat& src1, const Mat& src2, 
Mat& dst) { OutputArray _dst(dst); - binary_op(src1, src2, _dst, noArray(), getMaxTab(), false ); + binary_op(src1, src2, _dst, noArray(), getMaxTab(), false, OCL_OP_MAX ); } void cv::min(const Mat& src1, const Mat& src2, Mat& dst) { OutputArray _dst(dst); - binary_op(src1, src2, _dst, noArray(), getMinTab(), false ); + binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN ); +} + +void cv::max(const UMat& src1, const UMat& src2, UMat& dst) +{ + OutputArray _dst(dst); + binary_op(src1, src2, _dst, noArray(), getMaxTab(), false, OCL_OP_MAX ); +} + +void cv::min(const UMat& src1, const UMat& src2, UMat& dst) +{ + OutputArray _dst(dst); + binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN ); } @@ -1171,73 +1273,213 @@ static int actualScalarDepth(const double* data, int len) CV_32S; } -static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, - InputArray _mask, int dtype, BinaryFunc* tab, bool muldiv=false, void* usrdata=0) + +static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, + InputArray _mask, int wtype, + void* usrdata, int oclop, + bool haveScalar ) { - int kind1 = _src1.kind(), kind2 = _src2.kind(); - Mat src1 = _src1.getMat(), src2 = _src2.getMat(); + int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1); bool haveMask = !_mask.empty(); - bool reallocate = false; - bool src1Scalar = checkScalar(src1, src2.type(), kind1, kind2); - bool src2Scalar = checkScalar(src2, src1.type(), kind2, kind1); + if( (haveMask || haveScalar) && cn > 4 ) + return false; - if( (kind1 == kind2 || src1.channels() == 1) && src1.dims <= 2 && src2.dims <= 2 && - src1.size() == src2.size() && src1.type() == src2.type() && - !haveMask && ((!_dst.fixedType() && (dtype < 0 || CV_MAT_DEPTH(dtype) == src1.depth())) || - (_dst.fixedType() && _dst.type() == _src1.type())) && + int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), wdepth = CV_MAT_DEPTH(wtype); + wtype = CV_MAKETYPE(wdepth, 
cn); + int type2 = haveScalar ? _src2.type() : wtype, depth2 = CV_MAT_DEPTH(type2); + + UMat src1 = _src1.getUMat(), src2; + UMat dst = _dst.getUMat(), mask = _mask.getUMat(); + + char opts[1024]; + int kercn = haveMask || haveScalar ? cn : 1; + + if( (depth1 == depth2 || haveScalar) && ddepth == depth1 && wdepth == depth1 ) + { + const char* oclopstr = oclop2str[oclop]; + if( wdepth <= CV_16S ) + { + oclopstr = oclop == OCL_OP_ADD ? "OCL_OP_ADD_SAT" : + oclop == OCL_OP_SUB ? "OCL_OP_SUB_SAT" : + oclop == OCL_OP_RSUB ? "OCL_OP_RSUB_SAT" : oclopstr; + } + sprintf(opts, "-D %s%s -D %s -D dstT=%s", + (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"), + oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(ddepth, kercn))); + } + else + { + char cvtstr[3][32]; + sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT2=%s " + "-D dstT=%s -D workT=%s -D convertToWT1=%s " + "-D convertToWT2=%s -D convertToDT=%s", + (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"), + oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)), + ocl::typeToStr(CV_MAKETYPE(depth2, kercn)), + ocl::typeToStr(CV_MAKETYPE(ddepth, kercn)), + ocl::typeToStr(CV_MAKETYPE(wdepth, kercn)), + ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]), + ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]), + ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2])); + } + + const uchar* usrdata_p = (const uchar*)usrdata; + const double* usrdata_d = (const double*)usrdata; + float usrdata_f[3]; + int i, n = oclop == OCL_OP_MUL_SCALE || oclop == OCL_OP_DIV_SCALE || + oclop == OCL_OP_RECIP_SCALE ? 1 : oclop == OCL_OP_ADDW ? 
3 : 0; + if( n > 0 && wdepth == CV_32F ) + { + for( i = 0; i < n; i++ ) + usrdata_f[i] = (float)usrdata_d[i]; + usrdata_p = (const uchar*)usrdata_f; + } + size_t usrdata_esz = CV_ELEM_SIZE(wdepth); + + ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts); + if( k.empty() ) + return false; + + int cscale = cn/kercn; + + ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cscale); + ocl::KernelArg dstarg = haveMask ? ocl::KernelArg::ReadWrite(dst, cscale) : + ocl::KernelArg::WriteOnly(dst, cscale); + ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1); + + if( haveScalar ) + { + size_t esz = CV_ELEM_SIZE(wtype); + double buf[4]={0,0,0,0}; + Mat src2sc = _src2.getMat(); + + if( !src2sc.empty() ) + { + convertAndUnrollScalar(src2sc, wtype, (uchar*)buf, 1); + } + ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, buf, esz); + + if( !haveMask ) + k.args(src1arg, dstarg, scalararg); + else + k.args(src1arg, maskarg, dstarg, scalararg); + } + else + { + src2 = _src2.getUMat(); + ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cscale); + + if( !haveMask ) + { + if(n == 0) + k.args(src1arg, src2arg, dstarg); + else if(n == 1) + k.args(src1arg, src2arg, dstarg, + ocl::KernelArg(0, 0, 0, usrdata_p, usrdata_esz)); + else if(n == 3) + k.args(src1arg, src2arg, dstarg, + ocl::KernelArg(0, 0, 0, usrdata_p, usrdata_esz), + ocl::KernelArg(0, 0, 0, usrdata_p + usrdata_esz, usrdata_esz), + ocl::KernelArg(0, 0, 0, usrdata_p + usrdata_esz*2, usrdata_esz)); + else + CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters"); + } + else + { + k.args(src1arg, src2arg, maskarg, dstarg); + } + } + + size_t globalsize[] = { src1.cols*(cn/kercn), src1.rows }; + return k.run(2, globalsize, 0, false); +} + + +static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, + InputArray _mask, int dtype, BinaryFunc* tab, bool muldiv=false, + void* usrdata=0, int oclop=-1 ) +{ + const _InputArray *psrc1 = &_src1, *psrc2 = &_src2; + 
int kind1 = psrc1->kind(), kind2 = psrc2->kind(); + bool haveMask = !_mask.empty(); + bool reallocate = false; + int type1 = psrc1->type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1); + int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2); + int wtype, dims1 = psrc1->dims(), dims2 = psrc2->dims(); + Size sz1 = dims1 <= 2 ? psrc1->size() : Size(); + Size sz2 = dims2 <= 2 ? psrc2->size() : Size(); + bool use_opencl = (kind1 == _InputArray::UMAT || kind2 == _InputArray::UMAT) && + ocl::useOpenCL() && dims1 <= 2 && dims2 <= 2; + bool src1Scalar = checkScalar(*psrc1, type2, kind1, kind2); + bool src2Scalar = checkScalar(*psrc2, type1, kind2, kind1); + + if( (kind1 == kind2 || cn == 1) && sz1 == sz2 && dims1 <= 2 && dims2 <= 2 && type1 == type2 && + !haveMask && ((!_dst.fixedType() && (dtype < 0 || CV_MAT_DEPTH(dtype) == depth1)) || + (_dst.fixedType() && _dst.type() == type1)) && ((src1Scalar && src2Scalar) || (!src1Scalar && !src2Scalar)) ) { - _dst.create(src1.size(), src1.type()); - Mat dst = _dst.getMat(); + _dst.createSameSize(*psrc1, type1); + if( use_opencl && + ocl_arithm_op(*psrc1, *psrc2, _dst, _mask, + (!usrdata ? 
type1 : std::max(depth1, CV_32F)), + usrdata, oclop, false)) + return; + Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(); Size sz = getContinuousSize(src1, src2, dst, src1.channels()); - tab[src1.depth()](src1.data, src1.step, src2.data, src2.step, dst.data, dst.step, sz, usrdata); + tab[depth1](src1.data, src1.step, src2.data, src2.step, dst.data, dst.step, sz, usrdata); return; } bool haveScalar = false, swapped12 = false; - int depth2 = src2.depth(); - if( src1.size != src2.size || src1.channels() != src2.channels() || + + if( dims1 != dims2 || sz1 != sz2 || cn != cn2 || ((kind1 == _InputArray::MATX || kind2 == _InputArray::MATX) && - src1.cols == 1 && src2.rows == 4) ) + (sz1 == Size(1,4) || sz2 == Size(1,4))) ) { - if( checkScalar(src1, src2.type(), kind1, kind2) ) + if( checkScalar(*psrc1, type2, kind1, kind2) ) { // src1 is a scalar; swap it with src2 - swap(src1, src2); + swap(psrc1, psrc2); + swap(sz1, sz2); + swap(type1, type2); + swap(depth1, depth2); + swap(cn, cn2); + swap(dims1, dims2); swapped12 = true; + if( oclop == OCL_OP_SUB ) + oclop = OCL_OP_RSUB; } - else if( !checkScalar(src2, src1.type(), kind2, kind1) ) + else if( !checkScalar(*psrc2, type1, kind2, kind1) ) CV_Error( CV_StsUnmatchedSizes, - "The operation is neither 'array op array' (where arrays have the same size and the same number of channels), " + "The operation is neither 'array op array' " + "(where arrays have the same size and the same number of channels), " "nor 'array op scalar', nor 'scalar op array'" ); haveScalar = true; - CV_Assert(src2.type() == CV_64F && (src2.rows == 4 || src2.rows == 1)); + CV_Assert(type2 == CV_64F && (sz2.height == 1 || sz2.height == 4)); if (!muldiv) { - depth2 = actualScalarDepth(src2.ptr(), src1.channels()); - if( depth2 == CV_64F && (src1.depth() < CV_32S || src1.depth() == CV_32F) ) + Mat sc = psrc2->getMat(); + depth2 = actualScalarDepth(sc.ptr(), cn); + if( depth2 == CV_64F && (depth1 < CV_32S || depth1 == CV_32F) ) depth2 
= CV_32F; } else depth2 = CV_64F; } - int cn = src1.channels(), depth1 = src1.depth(), wtype; - BinaryFunc cvtsrc1 = 0, cvtsrc2 = 0, cvtdst = 0; - if( dtype < 0 ) { if( _dst.fixedType() ) dtype = _dst.type(); else { - if( !haveScalar && src1.type() != src2.type() ) + if( !haveScalar && type1 != type2 ) CV_Error(CV_StsBadArg, "When the input arrays in add/subtract/multiply/divide functions have different types, " "the output array type must be explicitly specified"); - dtype = src1.type(); + dtype = type1; } } dtype = CV_MAT_DEPTH(dtype); @@ -1262,39 +1504,41 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, wtype = std::max(wtype, dtype); } - cvtsrc1 = depth1 == wtype ? 0 : getConvertFunc(depth1, wtype); - cvtsrc2 = depth2 == depth1 ? cvtsrc1 : depth2 == wtype ? 0 : getConvertFunc(depth2, wtype); - cvtdst = dtype == wtype ? 0 : getConvertFunc(wtype, dtype); - dtype = CV_MAKETYPE(dtype, cn); wtype = CV_MAKETYPE(wtype, cn); - size_t esz1 = src1.elemSize(), esz2 = src2.elemSize(); - size_t dsz = CV_ELEM_SIZE(dtype), wsz = CV_ELEM_SIZE(wtype); - size_t blocksize0 = (size_t)(BLOCK_SIZE + wsz-1)/wsz; - BinaryFunc copymask = 0; - Mat mask; - if( haveMask ) { - mask = _mask.getMat(); - CV_Assert( (mask.type() == CV_8UC1 || mask.type() == CV_8SC1) ); - CV_Assert( mask.size == src1.size ); - copymask = getCopyMaskFunc(dsz); - Mat tdst = _dst.getMat(); - reallocate = tdst.size != src1.size || tdst.type() != dtype; + int mtype = _mask.type(); + CV_Assert( (mtype == CV_8UC1 || mtype == CV_8SC1) && _mask.sameSize(*psrc1) ); + reallocate = !_dst.sameSize(*psrc1) || _dst.type() != dtype; } - AutoBuffer _buf; - uchar *buf, *maskbuf = 0, *buf1 = 0, *buf2 = 0, *wbuf = 0; - size_t bufesz = (cvtsrc1 ? wsz : 0) + (cvtsrc2 || haveScalar ? wsz : 0) + (cvtdst ? wsz : 0) + (haveMask ? 
dsz : 0); + _dst.createSameSize(*psrc1, dtype); + if( reallocate ) + _dst.setTo(0.); - _dst.create(src1.dims, src1.size, dtype); - Mat dst = _dst.getMat(); + if( use_opencl && + ocl_arithm_op(*psrc1, *psrc2, _dst, _mask, wtype, + usrdata, oclop, haveScalar)) + return; - if( haveMask && reallocate ) - dst = Scalar::all(0); + BinaryFunc cvtsrc1 = type1 == wtype ? 0 : getConvertFunc(type1, wtype); + BinaryFunc cvtsrc2 = type2 == type1 ? cvtsrc1 : type2 == wtype ? 0 : getConvertFunc(type2, wtype); + BinaryFunc cvtdst = dtype == wtype ? 0 : getConvertFunc(wtype, dtype); + + size_t esz1 = CV_ELEM_SIZE(type1), esz2 = CV_ELEM_SIZE(type2); + size_t dsz = CV_ELEM_SIZE(dtype), wsz = CV_ELEM_SIZE(wtype); + size_t blocksize0 = (size_t)(BLOCK_SIZE + wsz-1)/wsz; + BinaryFunc copymask = getCopyMaskFunc(dsz); + Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(), mask = _mask.getMat(); + AutoBuffer _buf; + uchar *buf, *maskbuf = 0, *buf1 = 0, *buf2 = 0, *wbuf = 0; + size_t bufesz = (cvtsrc1 ? wsz : 0) + + (cvtsrc2 || haveScalar ? wsz : 0) + + (cvtdst ? wsz : 0) + + (haveMask ? 
dsz : 0); BinaryFunc func = tab[CV_MAT_DEPTH(wtype)]; if( !haveScalar ) @@ -1476,7 +1720,7 @@ static BinaryFunc* getAbsDiffTab() void cv::add( InputArray src1, InputArray src2, OutputArray dst, InputArray mask, int dtype ) { - arithm_op(src1, src2, dst, mask, dtype, getAddTab() ); + arithm_op(src1, src2, dst, mask, dtype, getAddTab(), false, 0, OCL_OP_ADD ); } void cv::subtract( InputArray src1, InputArray src2, OutputArray dst, @@ -1511,12 +1755,12 @@ void cv::subtract( InputArray src1, InputArray src2, OutputArray dst, } } #endif - arithm_op(src1, src2, dst, mask, dtype, getSubTab() ); + arithm_op(src1, src2, dst, mask, dtype, getSubTab(), false, 0, OCL_OP_SUB ); } void cv::absdiff( InputArray src1, InputArray src2, OutputArray dst ) { - arithm_op(src1, src2, dst, noArray(), -1, getAbsDiffTab()); + arithm_op(src1, src2, dst, noArray(), -1, getAbsDiffTab(), false, 0, OCL_OP_ABSDIFF); } /****************************************************************************************\ @@ -1847,19 +2091,20 @@ static BinaryFunc* getRecipTab() void cv::multiply(InputArray src1, InputArray src2, OutputArray dst, double scale, int dtype) { - arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(), true, &scale); + arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(), + true, &scale, scale == 1. ? 
OCL_OP_MUL : OCL_OP_MUL_SCALE); } void cv::divide(InputArray src1, InputArray src2, OutputArray dst, double scale, int dtype) { - arithm_op(src1, src2, dst, noArray(), dtype, getDivTab(), true, &scale); + arithm_op(src1, src2, dst, noArray(), dtype, getDivTab(), true, &scale, OCL_OP_DIV_SCALE); } void cv::divide(double scale, InputArray src2, OutputArray dst, int dtype) { - arithm_op(src2, src2, dst, noArray(), dtype, getRecipTab(), true, &scale); + arithm_op(src2, src2, dst, noArray(), dtype, getRecipTab(), true, &scale, OCL_OP_RECIP_SCALE); } /****************************************************************************************\ @@ -2020,7 +2265,7 @@ void cv::addWeighted( InputArray src1, double alpha, InputArray src2, double beta, double gamma, OutputArray dst, int dtype ) { double scalars[] = {alpha, beta, gamma}; - arithm_op(src1, src2, dst, noArray(), dtype, getAddWeightedTab(), true, scalars); + arithm_op(src1, src2, dst, noArray(), dtype, getAddWeightedTab(), true, scalars, OCL_OP_ADDW); } diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp index 1fb448f2a6..ee34ef4516 100644 --- a/modules/core/src/copy.cpp +++ b/modules/core/src/copy.cpp @@ -220,6 +220,21 @@ void Mat::copyTo( OutputArray _dst ) const return; } + if( _dst.isUMat() ) + { + _dst.create( dims, size.p, type() ); + UMat dst = _dst.getUMat(); + + size_t i, sz[CV_MAX_DIM], dstofs[CV_MAX_DIM], esz = elemSize(); + for( i = 0; i < (size_t)dims; i++ ) + sz[i] = size.p[i]; + sz[dims-1] *= esz; + dst.ndoffset(dstofs); + dstofs[dims-1] *= esz; + dst.u->currAllocator->upload(dst.u, data, dims, sz, dstofs, dst.step.p, step.p); + return; + } + if( dims <= 2 ) { _dst.create( rows, cols, type() ); diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp index e64bae42c9..263c0f662b 100644 --- a/modules/core/src/matrix.cpp +++ b/modules/core/src/matrix.cpp @@ -1436,6 +1436,181 @@ Size _InputArray::size(int i) const } } + +int _InputArray::sizend(int* sz, int i) const +{ + int j, 
d=0, k = kind(); + + if( k == NONE ) + ; + else if( k == MAT ) + { + CV_Assert( i < 0 ); + const Mat& m = *(const Mat*)obj; + d = m.dims; + if(sz) + for(j = 0; j < d; j++) + sz[j] = m.size.p[j]; + } + else if( k == UMAT ) + { + CV_Assert( i < 0 ); + const UMat& m = *(const UMat*)obj; + d = m.dims; + if(sz) + for(j = 0; j < d; j++) + sz[j] = m.size.p[j]; + } + else if( k == STD_VECTOR_MAT && i >= 0 ) + { + const std::vector& vv = *(const std::vector*)obj; + CV_Assert( i < (int)vv.size() ); + const Mat& m = vv[i]; + d = m.dims; + if(sz) + for(j = 0; j < d; j++) + sz[j] = m.size.p[j]; + } + else if( k == STD_VECTOR_UMAT && i >= 0 ) + { + const std::vector& vv = *(const std::vector*)obj; + CV_Assert( i < (int)vv.size() ); + const UMat& m = vv[i]; + d = m.dims; + if(sz) + for(j = 0; j < d; j++) + sz[j] = m.size.p[j]; + } + else + { + Size sz2d = size(i); + d = 2; + if(sz) + { + sz[0] = sz2d.height; + sz[1] = sz2d.width; + } + } + + return d; +} + + +bool _InputArray::sameSize(const _InputArray& arr) const +{ + int k1 = kind(), k2 = arr.kind(); + Size sz1; + + if( k1 == MAT ) + { + const Mat* m = ((const Mat*)obj); + if( k2 == MAT ) + return m->size == ((const Mat*)arr.obj)->size; + if( k2 == UMAT ) + return m->size == ((const UMat*)arr.obj)->size; + if( m->dims > 2 ) + return false; + sz1 = m->size(); + } + else if( k1 == UMAT ) + { + const UMat* m = ((const UMat*)obj); + if( k2 == MAT ) + return m->size == ((const Mat*)arr.obj)->size; + if( k2 == UMAT ) + return m->size == ((const UMat*)arr.obj)->size; + if( m->dims > 2 ) + return false; + sz1 = m->size(); + } + else + sz1 = size(); + if( arr.dims() > 2 ) + return false; + return sz1 == arr.size(); +} + +int _InputArray::dims(int i) const +{ + int k = kind(); + + if( k == MAT ) + { + CV_Assert( i < 0 ); + return ((const Mat*)obj)->dims; + } + + if( k == EXPR ) + { + CV_Assert( i < 0 ); + return ((const MatExpr*)obj)->a.dims; + } + + if( k == UMAT ) + { + CV_Assert( i < 0 ); + return ((const UMat*)obj)->dims; + } + + 
if( k == MATX ) + { + CV_Assert( i < 0 ); + return 2; + } + + if( k == STD_VECTOR ) + { + CV_Assert( i < 0 ); + return 2; + } + + if( k == NONE ) + return 0; + + if( k == STD_VECTOR_VECTOR ) + { + const std::vector >& vv = *(const std::vector >*)obj; + if( i < 0 ) + return 1; + CV_Assert( i < (int)vv.size() ); + return 2; + } + + if( k == STD_VECTOR_MAT ) + { + const std::vector& vv = *(const std::vector*)obj; + if( i < 0 ) + return 1; + CV_Assert( i < (int)vv.size() ); + + return vv[i].dims; + } + + if( k == OPENGL_BUFFER ) + { + CV_Assert( i < 0 ); + return 2; + } + + if( k == GPU_MAT ) + { + CV_Assert( i < 0 ); + return 2; + } + + if( k == OCL_MAT ) + { + return 2; + } + + CV_Assert( k == CUDA_MEM ); + //if( k == CUDA_MEM ) + { + CV_Assert( i < 0 ); + return 2; + } +} + size_t _InputArray::total(int i) const { int k = kind(); @@ -1570,6 +1745,61 @@ bool _InputArray::empty() const return ((const cuda::CudaMem*)obj)->empty(); } +bool _InputArray::isContinuous(int i) const +{ + int k = kind(); + + if( k == MAT ) + return i < 0 ? ((const Mat*)obj)->isContinuous() : true; + + if( k == UMAT ) + return i < 0 ? 
((const UMat*)obj)->isContinuous() : true; + + if( k == EXPR || k == MATX || k == STD_VECTOR || k == NONE || k == STD_VECTOR_VECTOR) + return true; + + if( k == STD_VECTOR_MAT ) + { + const std::vector& vv = *(const std::vector*)obj; + CV_Assert((size_t)i < vv.size()); + return vv[i].isContinuous(); + } + + if( k == STD_VECTOR_UMAT ) + { + const std::vector& vv = *(const std::vector*)obj; + CV_Assert((size_t)i < vv.size()); + return vv[i].isContinuous(); + } + + CV_Error(CV_StsNotImplemented, "This method is not implemented for oclMat yet"); + return false; +} + +void _InputArray::copyTo(const _OutputArray& arr) const +{ + int k = kind(); + + if( k == NONE ) + arr.release(); + else if( k == MAT || k == MATX || k == STD_VECTOR ) + { + Mat m = getMat(); + m.copyTo(arr); + } + else if( k == EXPR ) + { + const MatExpr& e = *((MatExpr*)obj); + if( arr.kind() == MAT ) + arr.getMatRef() = e; + else + Mat(e).copyTo(arr); + } + else if( k == UMAT ) + ((UMat*)obj)->copyTo(arr); + else + CV_Error(Error::StsNotImplemented, ""); +} bool _OutputArray::fixedSize() const { @@ -1899,6 +2129,12 @@ void _OutputArray::create(int dims, const int* sizes, int mtype, int i, CV_Error(Error::StsNotImplemented, "Unknown/unsupported array type"); } +void _OutputArray::createSameSize(const _InputArray& arr, int mtype) const +{ + int sz[CV_MAX_DIM], d = arr.sizend(sz); + create(d, sz, mtype); +} + void _OutputArray::release() const { CV_Assert(!fixedSize()); @@ -2010,6 +2246,23 @@ cuda::CudaMem& _OutputArray::getCudaMemRef() const return *(cuda::CudaMem*)obj; } +void _OutputArray::setTo(const _InputArray& arr) const +{ + int k = kind(); + + if( k == NONE ) + ; + else if( k == MAT || k == MATX || k == STD_VECTOR ) + { + Mat m = getMat(); + m.setTo(arr); + } + else if( k == UMAT ) + ((UMat*)obj)->setTo(arr); + else + CV_Error(Error::StsNotImplemented, ""); +} + static _InputOutputArray _none; InputOutputArray noArray() { return _none; } diff --git a/modules/core/src/ocl.cpp 
b/modules/core/src/ocl.cpp index 094a80d974..35927608d0 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -592,9 +592,16 @@ static void* initOpenCLAndLoad(const char* funcname) { if(!initialized) { - handle = dlopen("/System/Library/Frameworks/OpenCL.framework/Versions/Current/OpenCL", RTLD_LAZY); + const char* oclpath = getenv("OPENCV_OPENCL_RUNTIME"); + oclpath = oclpath && strlen(oclpath) > 0 ? oclpath : + "/System/Library/Frameworks/OpenCL.framework/Versions/Current/OpenCL"; + handle = dlopen(oclpath, RTLD_LAZY); initialized = true; g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0; + if( g_haveOpenCL ) + fprintf(stderr, "Succesffuly loaded OpenCL v1.1+ runtime from %s\n", oclpath); + else + fprintf(stderr, "Failed to load OpenCL runtime\n"); } if(!handle) return 0; @@ -1212,16 +1219,13 @@ namespace cv { namespace ocl { struct UMat2D { - UMat2D(const UMat& m, int accessFlags) + UMat2D(const UMat& m) { - CV_Assert(m.dims == 2); - data = (cl_mem)m.handle(accessFlags); offset = m.offset; step = m.step; rows = m.rows; cols = m.cols; } - cl_mem data; size_t offset; size_t step; int rows; @@ -1230,10 +1234,8 @@ struct UMat2D struct UMat3D { - UMat3D(const UMat& m, int accessFlags) + UMat3D(const UMat& m) { - CV_Assert(m.dims == 3); - data = (cl_mem)m.handle(accessFlags); offset = m.offset; step = m.step.p[1]; slicestep = m.step.p[0]; @@ -1241,7 +1243,6 @@ struct UMat3D rows = m.size.p[1]; cols = m.size.p[2]; } - cl_mem data; size_t offset; size_t slicestep; size_t step; @@ -1315,7 +1316,7 @@ void setUseOpenCL(bool flag) } } -void finish() +void finish2() { Queue::getDefault().finish(); } @@ -1528,7 +1529,7 @@ String Device::OpenCLVersion() const { return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); } String Device::driverVersion() const -{ return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); } +{ return p ? p->getStrProp(CL_DRIVER_VERSION) : String(); } int Device::type() const { return p ? 
p->getProp(CL_DEVICE_TYPE) : 0; } @@ -1705,14 +1706,14 @@ size_t Device::profilingTimerResolution() const const Device& Device::getDefault() { - const Context& ctx = Context::getDefault(); + const Context2& ctx = Context2::getDefault(); int idx = TLSData::get()->device; return ctx.device(idx); } ///////////////////////////////////////////////////////////////////////////////////////// -struct Context::Impl +struct Context2::Impl { Impl(int dtype0) { @@ -1777,7 +1778,7 @@ struct Context::Impl devices.clear(); } - Program getProg(const ProgramSource& src, + Program getProg(const ProgramSource2& src, const String& buildflags, String& errmsg) { String prefix = Program::getPrefix(buildflags); @@ -1787,7 +1788,8 @@ struct Context::Impl return it->second; //String filename = format("%08x%08x_%08x%08x.clb2", Program prog(src, buildflags, errmsg); - phash.insert(std::pair(k, prog)); + if(prog.ptr()) + phash.insert(std::pair(k, prog)); return prog; } @@ -1797,7 +1799,7 @@ struct Context::Impl std::vector devices; bool initialized; - typedef ProgramSource::hash_t hash_t; + typedef ProgramSource2::hash_t hash_t; struct HashKey { @@ -1812,18 +1814,18 @@ struct Context::Impl }; -Context::Context() +Context2::Context2() { p = 0; } -Context::Context(int dtype) +Context2::Context2(int dtype) { p = 0; create(dtype); } -bool Context::create(int dtype0) +bool Context2::create(int dtype0) { if( !haveOpenCL() ) return false; @@ -1838,19 +1840,19 @@ bool Context::create(int dtype0) return p != 0; } -Context::~Context() +Context2::~Context2() { p->release(); } -Context::Context(const Context& c) +Context2::Context2(const Context2& c) { p = (Impl*)c.p; if(p) p->addref(); } -Context& Context::operator = (const Context& c) +Context2& Context2::operator = (const Context2& c) { Impl* newp = (Impl*)c.p; if(newp) @@ -1861,30 +1863,30 @@ Context& Context::operator = (const Context& c) return *this; } -void* Context::ptr() const +void* Context2::ptr() const { return p->handle; } -size_t 
Context::ndevices() const +size_t Context2::ndevices() const { return p ? p->devices.size() : 0; } -const Device& Context::device(size_t idx) const +const Device& Context2::device(size_t idx) const { static Device dummy; return !p || idx >= p->devices.size() ? dummy : p->devices[idx]; } -Context& Context::getDefault() +Context2& Context2::getDefault() { - static Context ctx; + static Context2 ctx; if( !ctx.p && haveOpenCL() ) { - // do not create new Context right away. + // do not create new Context2 right away. // First, try to retrieve existing context of the same type. - // In its turn, Platform::getContext() may call Context::create() + // In its turn, Platform::getContext() may call Context2::create() // if there is no such context. ctx.create(Device::TYPE_ACCELERATOR); if(!ctx.p) @@ -1898,7 +1900,7 @@ Context& Context::getDefault() return ctx; } -Program Context::getProg(const ProgramSource& prog, +Program Context2::getProg(const ProgramSource2& prog, const String& buildopts, String& errmsg) { return p ? 
p->getProg(prog, buildopts, errmsg) : Program(); @@ -1906,14 +1908,14 @@ Program Context::getProg(const ProgramSource& prog, struct Queue::Impl { - Impl(const Context& c, const Device& d) + Impl(const Context2& c, const Device& d) { refcount = 1; - const Context* pc = &c; + const Context2* pc = &c; cl_context ch = (cl_context)pc->ptr(); if( !ch ) { - pc = &Context::getDefault(); + pc = &Context2::getDefault(); ch = (cl_context)pc->ptr(); } cl_device_id dh = (cl_device_id)d.ptr(); @@ -1943,7 +1945,7 @@ Queue::Queue() p = 0; } -Queue::Queue(const Context& c, const Device& d) +Queue::Queue(const Context2& c, const Device& d) { p = 0; create(c, d); @@ -1973,7 +1975,7 @@ Queue::~Queue() p->release(); } -bool Queue::create(const Context& c, const Device& d) +bool Queue::create(const Context2& c, const Device& d) { if(p) p->release(); @@ -1996,7 +1998,7 @@ Queue& Queue::getDefault() { Queue& q = TLSData::get()->oclQueue; if( !q.p ) - q.create(Context::getDefault()); + q.create(Context2::getDefault()); return q; } @@ -2008,15 +2010,20 @@ static cl_command_queue getQueue(const Queue& q) return qq; } -KernelArg::KernelArg(int _flags, UMat* _m, void* _obj, size_t _sz) - : flags(_flags), m(_m), obj(_obj), sz(_sz) +KernelArg::KernelArg() + : flags(0), m(0), obj(0), sz(0), wscale(1) +{ +} + +KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, const void* _obj, size_t _sz) + : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale) { } KernelArg KernelArg::Constant(const Mat& m) { CV_Assert(m.isContinuous()); - return KernelArg(CONSTANT, 0, m.data, m.total()*m.elemSize()); + return KernelArg(CONSTANT, 0, 1, m.data, m.total()*m.elemSize()); } @@ -2099,8 +2106,8 @@ Kernel::Kernel(const char* kname, const Program& prog) create(kname, prog); } -Kernel::Kernel(const char* kname, const ProgramSource& src, - const String& buildopts, String& errmsg) +Kernel::Kernel(const char* kname, const ProgramSource2& src, + const String& buildopts, String* errmsg) { p = 0; create(kname, src, 
buildopts, errmsg); @@ -2143,15 +2150,17 @@ bool Kernel::create(const char* kname, const Program& prog) return p != 0; } -bool Kernel::create(const char* kname, const ProgramSource& src, - const String& buildopts, String& errmsg) +bool Kernel::create(const char* kname, const ProgramSource2& src, + const String& buildopts, String* errmsg) { if(p) { p->release(); p = 0; } - const Program& prog = Context::getDefault().getProg(src, buildopts, errmsg); + String tempmsg; + if( !errmsg ) errmsg = &tempmsg; + const Program& prog = Context2::getDefault().getProg(src, buildopts, *errmsg); return create(kname, prog); } @@ -2160,55 +2169,91 @@ void* Kernel::ptr() const return p ? p->handle : 0; } -void Kernel::set(int i, const void* value, size_t sz) +bool Kernel::empty() const { - CV_Assert( p && clSetKernelArg(p->handle, (cl_uint)i, sz, value) >= 0 ); + return ptr() == 0; +} + +int Kernel::set(int i, const void* value, size_t sz) +{ + CV_Assert(i >= 0); if( i == 0 ) p->cleanupUMats(); + if( !p || !p->handle || clSetKernelArg(p->handle, (cl_uint)i, sz, value) < 0 ) + return -1; + return i+1; } -void Kernel::set(int i, const UMat& m) +int Kernel::set(int i, const UMat& m) { - set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m, 0, 0)); + return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m, 0, 0)); } -void Kernel::set(int i, const KernelArg& arg) +int Kernel::set(int i, const KernelArg& arg) { - CV_Assert( p && p->handle ); + CV_Assert( i >= 0 ); if( i == 0 ) p->cleanupUMats(); + if( !p || !p->handle ) + return -1; if( arg.m ) { int accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : 0) + ((arg.flags & KernelArg::WRITE_ONLY) ? 
ACCESS_WRITE : 0); + cl_mem h = (cl_mem)arg.m->handle(accessFlags); + if( arg.m->dims <= 2 ) { - UMat2D u2d(*arg.m, accessFlags); - clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d), &u2d); + UMat2D u2d(*arg.m); + clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h); + clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step); + clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset); + i += 3; + + if( !(arg.flags & KernelArg::NO_SIZE) ) + { + int cols = u2d.cols*arg.wscale; + clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows); + clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.cols), &cols); + i += 2; + } } else { - UMat3D u3d(*arg.m, accessFlags); - clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d), &u3d); + UMat3D u3d(*arg.m); + clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h); + clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep); + clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step); + clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset); + i += 4; + if( !(arg.flags & KernelArg::NO_SIZE) ) + { + int cols = u3d.cols*arg.wscale; + clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.rows); + clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows); + clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols); + i += 3; + } } p->addUMat(*arg.m); + return i; } - else - { - clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj); - } + clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj); + return i+1; } -void Kernel::run(int dims, size_t offset[], size_t globalsize[], size_t localsize[], +bool Kernel::run(int dims, size_t globalsize[], size_t localsize[], bool sync, const Queue& q) { - CV_Assert(p && p->handle && p->e == 0); + if(!p || !p->handle || p->e != 0) + return false; cl_command_queue qq = getQueue(q); - clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims, - offset, globalsize, localsize, 0, 0, - 
sync ? 0 : &p->e); - if( sync ) + size_t offset[CV_MAX_DIM] = {0}; + cl_int retval = clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims, + offset, globalsize, localsize, 0, 0, + sync ? 0 : &p->e); + if( sync || retval < 0 ) { clFinish(qq); p->cleanupUMats(); @@ -2218,14 +2263,17 @@ void Kernel::run(int dims, size_t offset[], size_t globalsize[], size_t localsiz p->addref(); clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p); } + return retval >= 0; } -void Kernel::runTask(bool sync, const Queue& q) +bool Kernel::runTask(bool sync, const Queue& q) { - CV_Assert(p && p->handle && p->e == 0); + if(!p || !p->handle || p->e != 0) + return false; + cl_command_queue qq = getQueue(q); - clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &p->e); - if( sync ) + cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &p->e); + if( sync || retval < 0 ) { clFinish(qq); p->cleanupUMats(); @@ -2235,6 +2283,7 @@ void Kernel::runTask(bool sync, const Queue& q) p->addref(); clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p); } + return retval >= 0; } @@ -2273,11 +2322,11 @@ size_t Kernel::localMemSize() const struct Program::Impl { - Impl(const ProgramSource& _src, + Impl(const ProgramSource2& _src, const String& _buildflags, String& errmsg) { refcount = 1; - const Context& ctx = Context::getDefault(); + const Context2& ctx = Context2::getDefault(); src = _src; buildflags = _buildflags; const String& srcstr = src.source(); @@ -2293,17 +2342,20 @@ struct Program::Impl void** deviceList = deviceListBuf; for( i = 0; i < n; i++ ) deviceList[i] = ctx.device(i).ptr(); + printf("Building the OpenCL program ...\n"); retval = clBuildProgram(handle, n, (const cl_device_id*)deviceList, buildflags.c_str(), 0, 0); if( retval == CL_BUILD_PROGRAM_FAILURE ) { - char buf[1024]; + char buf[1<<16]; size_t retsz = 0; clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0], CL_PROGRAM_BUILD_LOG, sizeof(buf)-16, buf, &retsz); errmsg = String(buf); + CV_Error_(Error::StsAssert, 
("OpenCL program can not be built: %s", errmsg.c_str())); } + CV_Assert(retval >= 0); } } @@ -2315,7 +2367,7 @@ struct Program::Impl if(_buf.empty()) return; String prefix0 = Program::getPrefix(buildflags); - const Context& ctx = Context::getDefault(); + const Context2& ctx = Context2::getDefault(); const Device& dev = Device::getDefault(); const char* pos0 = _buf.c_str(); const char* pos1 = strchr(pos0, '\n'); @@ -2366,7 +2418,7 @@ struct Program::Impl IMPLEMENT_REFCOUNTABLE(); - ProgramSource src; + ProgramSource2 src; String buildflags; cl_program handle; }; @@ -2374,7 +2426,7 @@ struct Program::Impl Program::Program() { p = 0; } -Program::Program(const ProgramSource& src, +Program::Program(const ProgramSource2& src, const String& buildflags, String& errmsg) { p = 0; @@ -2405,7 +2457,7 @@ Program::~Program() p->release(); } -bool Program::create(const ProgramSource& src, +bool Program::create(const ProgramSource2& src, const String& buildflags, String& errmsg) { if(p) @@ -2419,9 +2471,9 @@ bool Program::create(const ProgramSource& src, return p != 0; } -const ProgramSource& Program::source() const +const ProgramSource2& Program::source() const { - static ProgramSource dummy; + static ProgramSource2 dummy; return p ? 
p->src : dummy; } @@ -2455,7 +2507,7 @@ String Program::getPrefix() const String Program::getPrefix(const String& buildflags) { - const Context& ctx = Context::getDefault(); + const Context2& ctx = Context2::getDefault(); const Device& dev = ctx.device(0); return format("name=%s\ndriver=%s\nbuildflags=%s\n", dev.name().c_str(), dev.driverVersion().c_str(), buildflags.c_str()); @@ -2463,7 +2515,7 @@ String Program::getPrefix(const String& buildflags) //////////////////////////////////////////////////////////////////////////////////////// -struct ProgramSource::Impl +struct ProgramSource2::Impl { Impl(const char* _src) { @@ -2482,39 +2534,39 @@ struct ProgramSource::Impl IMPLEMENT_REFCOUNTABLE(); String src; - ProgramSource::hash_t h; + ProgramSource2::hash_t h; }; -ProgramSource::ProgramSource() +ProgramSource2::ProgramSource2() { p = 0; } -ProgramSource::ProgramSource(const char* prog) +ProgramSource2::ProgramSource2(const char* prog) { p = new Impl(prog); } -ProgramSource::ProgramSource(const String& prog) +ProgramSource2::ProgramSource2(const String& prog) { p = new Impl(prog); } -ProgramSource::~ProgramSource() +ProgramSource2::~ProgramSource2() { if(p) p->release(); } -ProgramSource::ProgramSource(const ProgramSource& prog) +ProgramSource2::ProgramSource2(const ProgramSource2& prog) { p = prog.p; if(p) p->addref(); } -ProgramSource& ProgramSource::operator = (const ProgramSource& prog) +ProgramSource2& ProgramSource2::operator = (const ProgramSource2& prog) { Impl* newp = (Impl*)prog.p; if(newp) @@ -2525,13 +2577,13 @@ ProgramSource& ProgramSource::operator = (const ProgramSource& prog) return *this; } -const String& ProgramSource::source() const +const String& ProgramSource2::source() const { static String dummy; return p ? p->src : dummy; } -ProgramSource::hash_t ProgramSource::hash() const +ProgramSource2::hash_t ProgramSource2::hash() const { return p ? 
p->h : 0; } @@ -2551,7 +2603,7 @@ public: return u; } - void getBestFlags(const Context& ctx, int& createFlags, int& flags0) const + void getBestFlags(const Context2& ctx, int& createFlags, int& flags0) const { const Device& dev = ctx.device(0); createFlags = CL_MEM_READ_WRITE; @@ -2574,7 +2626,7 @@ public: total *= sizes[i]; } - Context& ctx = Context::getDefault(); + Context2& ctx = Context2::getDefault(); int createFlags = 0, flags0 = 0; getBestFlags(ctx, createFlags, flags0); @@ -2603,7 +2655,7 @@ public: if(u->handle == 0) { CV_Assert(u->origdata != 0); - Context& ctx = Context::getDefault(); + Context2& ctx = Context2::getDefault(); int createFlags = 0, flags0 = 0; getBestFlags(ctx, createFlags, flags0); @@ -2848,7 +2900,6 @@ public: new_srcofs, new_dstofs, new_sz, new_srcstep[0], new_srcstep[1], new_dststep[0], new_dststep[1], dstptr, 0, 0, 0) >= 0 ); } - clFinish(q); } void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[], @@ -2890,6 +2941,9 @@ public: if( iscontinuous ) { + int crc = 0; + for( size_t i = 0; i < total; i++ ) + crc ^= ((uchar*)srcptr)[i]; CV_Assert( clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, dstrawofs, total, srcptr, 0, 0, 0) >= 0 ); } @@ -2949,10 +3003,11 @@ public: } else { - CV_Assert( clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle, + cl_int retval; + CV_Assert( (retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle, new_srcofs, new_dstofs, new_sz, new_srcstep[0], new_srcstep[1], new_dststep[0], new_dststep[1], - 0, 0, 0) >= 0 ); + 0, 0, 0)) >= 0 ); } dst->markHostCopyObsolete(true); @@ -2969,4 +3024,61 @@ MatAllocator* getOpenCLAllocator() return &allocator; } +const char* typeToStr(int t) +{ + static const char* tab[]= + { + "uchar", "uchar2", "uchar3", "uchar4", + "char", "char2", "char3", "char4", + "ushort", "ushort2", "ushort3", "ushort4", + "short", "short2", "short3", "short4", + "int", "int2", "int3", "int4", + "float", "float2", "float3", "float4", + 
"double", "double2", "double3", "double4", + "?", "?", "?", "?" + }; + int cn = CV_MAT_CN(t); + return cn >= 4 ? "?" : tab[CV_MAT_DEPTH(t)*4 + cn-1]; +} + +const char* memopTypeToStr(int t) +{ + static const char* tab[]= + { + "uchar", "uchar2", "uchar3", "uchar4", + "uchar", "uchar2", "uchar3", "uchar4", + "ushort", "ushort2", "ushort3", "ushort4", + "ushort", "ushort2", "ushort3", "ushort4", + "int", "int2", "int3", "int4", + "int", "int2", "int3", "int4", + "long", "long2", "long3", "long4", + "?", "?", "?", "?" + }; + int cn = CV_MAT_CN(t); + return cn >= 4 ? "?" : tab[CV_MAT_DEPTH(t)*4 + cn-1]; +} + +const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf) +{ + if( sdepth == ddepth ) + return "noconvert"; + const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn)); + if( ddepth >= CV_32F || + (ddepth == CV_32S && sdepth < CV_32S) || + (ddepth == CV_16S && sdepth <= CV_8S) || + (ddepth == CV_16U && sdepth == CV_8U)) + { + sprintf(buf, "convert_%s", typestr); + } + else if( sdepth >= CV_32F ) + { + sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : "")); + } + else + { + sprintf(buf, "convert_%s_sat", typestr); + } + return buf; +} + }} diff --git a/modules/core/src/opencl/arithm.cl b/modules/core/src/opencl/arithm.cl new file mode 100644 index 0000000000..90fbace2d8 --- /dev/null +++ b/modules/core/src/opencl/arithm.cl @@ -0,0 +1,307 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. 
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Jia Haipeng, jiahaipeng95@gmail.com +// +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the copyright holders or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +/* + Usage: + after compiling this program user gets a single kernel called KF. 
+ the following flags should be passed: + 1) one of "-D BINARY_OP", "-D UNARY_OP", "-D MASK_BINARY_OP" or "-D MASK_UNARY_OP" + 2) the actual operation performed, one of "-D OP_...", see below the list of operations. + 2a) "-D dstDepth= [-D cn= -D srcDepth2= -D dstDepth= + -D workDepth= [-D cn=]" - for mixed-type operations +*/ + +#if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif +#endif + +#define CV_32S 4 +#define CV_32F 5 + +#define dstelem *(dstT*)(dstptr + dst_index) +#define noconvert(x) x + +#ifndef workT + + #define srcT1 dstT + #define srcT2 dstT + #define workT dstT + #define srcelem1 *(dstT*)(srcptr1 + src1_index) + #define srcelem2 *(dstT*)(srcptr2 + src2_index) + #define convertToDT noconvert + +#else + + #define srcelem1 convertToWT1(*(srcT1*)(srcptr1 + src1_index)) + #define srcelem2 convertToWT2(*(srcT2*)(srcptr2 + src2_index)) + +#endif + +#define EXTRA_PARAMS + +#if defined OP_ADD_SAT +#define PROCESS_ELEM dstelem = add_sat(srcelem1, srcelem2) + +#elif defined OP_ADD +#define PROCESS_ELEM dstelem = convertToDT(srcelem1 + srcelem2) + +#elif defined OP_SUB_SAT +#define PROCESS_ELEM dstelem = sub_sat(srcelem1, srcelem2) + +#elif defined OP_SUB +#define PROCESS_ELEM dstelem = convertToDT(srcelem1 - srcelem2) + +#elif defined OP_RSUB_SAT +#define PROCESS_ELEM dstelem = sub_sat(srcelem2, srcelem1) + +#elif defined OP_RSUB +#define PROCESS_ELEM dstelem = convertToDT(srcelem2 - srcelem1) + +#elif defined OP_ABSDIFF +#define PROCESS_ELEM dstelem = abs_diff(srcelem1, srcelem2) + +#elif defined OP_AND +#define PROCESS_ELEM dstelem = srcelem1 & srcelem2 + +#elif defined OP_OR +#define PROCESS_ELEM dstelem = srcelem1 | srcelem2 + +#elif defined OP_XOR +#define PROCESS_ELEM dstelem = srcelem1 ^ srcelem2 + +#elif defined OP_NOT +#define PROCESS_ELEM dstelem = ~srcelem1 + +#elif defined OP_MIN +#define PROCESS_ELEM dstelem = min(srcelem1, 
srcelem2) + +#elif defined OP_MAX +#define PROCESS_ELEM dstelem = max(srcelem1, srcelem2) + +#elif defined OP_MUL +#define PROCESS_ELEM dstelem = convertToDT(srcelem1 * srcelem2) + +#elif defined OP_MUL_SCALE +#undef EXTRA_PARAMS +#define EXTRA_PARAMS , workT scale +#define PROCESS_ELEM dstelem = convertToDT(srcelem1 * srcelem2 * scale) + +#elif defined OP_DIV +#define PROCESS_ELEM \ + workT e2 = srcelem2, zero = (workT)(0); \ + dstelem = convertToDT(e2 != zero ? srcelem1 / e2 : zero) + +#elif defined OP_DIV_SCALE +#undef EXTRA_PARAMS +#define EXTRA_PARAMS , workT scale +#define PROCESS_ELEM \ + workT e2 = srcelem2, zero = (workT)(0); \ + dstelem = convertToDT(e2 != zero ? srcelem1 * scale / e2 : zero) + +#elif defined OP_RECIP_SCALE +#undef EXTRA_PARAMS +#define EXTRA_PARAMS , workT scale +#define PROCESS_ELEM \ + workT e1 = srcelem1, zero = (workT)(0); \ + dstelem = convertToDT(e1 != zero ? scale / e1 : zero) + +#elif defined OP_ADDW +#undef EXTRA_PARAMS +#define EXTRA_PARAMS , workT alpha, workT beta, workT gamma +#define PROCESS_ELEM dstelem = convertToDT(srcelem1*alpha + srcelem2*beta + gamma) + +#elif defined OP_MAG +#define PROCESS_ELEM dstelem = hypot(srcelem1, srcelem2) + +#elif defined OP_PHASE_RADIANS +#define PROCESS_ELEM \ + workT tmp = atan2(srcelem2, srcelem1); \ + if(tmp < 0) tmp += 6.283185307179586232; \ + dstelem = tmp + +#elif defined OP_PHASE_DEGREES + #define PROCESS_ELEM \ + workT tmp = atan2(srcelem2, srcelem1)*57.29577951308232286465; \ + if(tmp < 0) tmp += 360; \ + dstelem = tmp + +#elif defined OP_EXP +#define PROCESS_ELEM dstelem = exp(srcelem1) + +#elif defined OP_SQRT +#define PROCESS_ELEM dstelem = sqrt(srcelem1) + +#elif defined OP_LOG +#define PROCESS_ELEM dstelem = log(abs(srcelem1)) + +#elif defined OP_CMP +#define PROCESS_ELEM dstelem = convert_uchar(srcelem1 CMP_OPERATOR srcelem2 ? 
255 : 0) + +#elif defined OP_CONVERT +#define PROCESS_ELEM dstelem = convertToDT(srcelem1) + +#elif defined OP_CONVERT_SCALE +#undef EXTRA_PARAMS +#define EXTRA_PARAMS , workT alpha, workT beta +#define PROCESS_ELEM dstelem = convertToDT(srcelem1*alpha + beta) + +#else +#error "unknown op type" +#endif + +#if defined UNARY_OP || defined MASK_UNARY_OP +#undef srcelem2 +#if defined OP_AND || defined OP_OR || defined OP_XOR || defined OP_ADD || defined OP_SAT_ADD || \ + defined OP_SUB || defined OP_SAT_SUB || defined OP_RSUB || defined OP_SAT_RSUB || \ + defined OP_ABSDIFF || defined OP_CMP || defined OP_MIN || defined OP_MAX + #undef EXTRA_PARAMS + #define EXTRA_PARAMS , workT srcelem2 +#endif +#endif + +#if defined BINARY_OP + +__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1, + __global const uchar* srcptr2, int srcstep2, int srcoffset2, + __global uchar* dstptr, int dststep, int dstoffset, + int rows, int cols EXTRA_PARAMS ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (x < cols && y < rows) + { + int src1_index = mad24(y, srcstep1, x*sizeof(srcT1) + srcoffset1); + int src2_index = mad24(y, srcstep2, x*sizeof(srcT2) + srcoffset2); + int dst_index = mad24(y, dststep, x*sizeof(dstT) + dstoffset); + + PROCESS_ELEM; + //printf("(x=%d, y=%d). 
%d, %d, %d\n", x, y, (int)srcelem1, (int)srcelem2, (int)dstelem); + } +} + +#elif defined MASK_BINARY_OP + +__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1, + __global const uchar* srcptr2, int srcstep2, int srcoffset2, + __global const uchar* mask, int maskstep, int maskoffset, + __global uchar* dstptr, int dststep, int dstoffset, + int rows, int cols EXTRA_PARAMS ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (x < cols && y < rows) + { + int mask_index = mad24(y, maskstep, x + maskoffset); + if( mask[mask_index] ) + { + int src1_index = mad24(y, srcstep1, x*sizeof(srcT1) + srcoffset1); + int src2_index = mad24(y, srcstep2, x*sizeof(srcT2) + srcoffset2); + int dst_index = mad24(y, dststep, x*sizeof(dstT) + dstoffset); + + PROCESS_ELEM; + } + } +} + +#elif defined UNARY_OP + +__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1, + __global uchar* dstptr, int dststep, int dstoffset, + int rows, int cols EXTRA_PARAMS ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (x < cols && y < rows) + { + int src1_index = mad24(y, srcstep1, x*sizeof(srcT1) + srcoffset1); + int dst_index = mad24(y, dststep, x*sizeof(dstT) + dstoffset); + + PROCESS_ELEM; + } +} + +#elif defined MASK_UNARY_OP + +__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1, + __global const uchar* mask, int maskstep, int maskoffset, + __global uchar* dstptr, int dststep, int dstoffset, + int rows, int cols EXTRA_PARAMS ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (x < cols && y < rows) + { + int mask_index = mad24(y, maskstep, x + maskoffset); + if( mask[mask_index] ) + { + int src1_index = mad24(y, srcstep1, x*sizeof(srcT1) + srcoffset1); + int dst_index = mad24(y, dststep, x*sizeof(dstT) + dstoffset); + + PROCESS_ELEM; + } + } +} + +#else + +#error "Unknown operation type" + +#endif + + + + diff --git a/modules/core/src/opencl/copyset.cl 
b/modules/core/src/opencl/copyset.cl new file mode 100644 index 0000000000..9b736162f5 --- /dev/null +++ b/modules/core/src/opencl/copyset.cl @@ -0,0 +1,74 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the copyright holders or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +__kernel void setMask(__global const uchar* mask, int maskstep, int maskoffset, + __global uchar* dstptr, int dststep, int dstoffset, + int rows, int cols, dstT value ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (x < cols && y < rows) + { + int mask_index = mad24(y, maskstep, x + maskoffset); + if( mask[mask_index] ) + { + int dst_index = mad24(y, dststep, x*sizeof(dstT) + dstoffset); + *(dstT*)(dstptr + dst_index) = value; + } + } +} + +__kernel void set(__global uchar* dstptr, int dststep, int dstoffset, + int rows, int cols, dstT value ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (x < cols && y < rows) + { + int dst_index = mad24(y, dststep, x*sizeof(dstT) + dstoffset); + *(dstT*)(dstptr + dst_index) = value; + } +} + diff --git a/modules/core/src/opencl/mulspectrums.cl b/modules/core/src/opencl/mulspectrums.cl new file mode 100644 index 0000000000..86d4e5d520 --- /dev/null +++ b/modules/core/src/opencl/mulspectrums.cl @@ -0,0 +1,96 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. 
+// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Peng Xiao, pengxiao@multicorewareinc.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the uintel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business uinterruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +typedef float2 cfloat; +inline cfloat cmulf(cfloat a, cfloat b) +{ + return (cfloat)( a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x); +} + +inline cfloat conjf(cfloat a) +{ + return (cfloat)( a.x, - a.y ); +} + +__kernel void +mulAndScaleSpectrumsKernel( + __global const cfloat* a, + __global const cfloat* b, + float scale, + __global cfloat* dst, + uint cols, + uint rows, + uint mstep +) +{ + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint idx = mad24(y, mstep / sizeof(cfloat), x); + if (x < cols && y < rows) + { + cfloat v = cmulf(a[idx], b[idx]); + dst[idx] = (cfloat)( v.x * scale, v.y * scale ); + } +} +__kernel void +mulAndScaleSpectrumsKernel_CONJ( + __global const cfloat* a, + __global const cfloat* b, + float scale, + __global cfloat* dst, + uint cols, + uint rows, + uint mstep +) +{ + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint idx = mad24(y, mstep / sizeof(cfloat), x); + if (x < cols && y < rows) + { + cfloat v = cmulf(a[idx], conjf(b[idx])); + dst[idx] = (cfloat)( v.x * scale, v.y * scale ); + } +} diff --git a/modules/core/src/opencl/polarcart.cl b/modules/core/src/opencl/polarcart.cl new file mode 100644 index 0000000000..1883df7804 --- /dev/null +++ b/modules/core/src/opencl/polarcart.cl @@ -0,0 +1,73 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. 
+// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the copyright holders or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +__kernel void polarToCart(__global const uchar* mask, int maskstep, int maskoffset, + __global uchar* dstptr, int dststep, int dstoffset, + int rows, int cols, dstT value ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (x < cols && y < rows) + { + int mask_index = mad24(y, maskstep, x + maskoffset); + if( mask[mask_index] ) + { + int dst_index = mad24(y, dststep, x*sizeof(dstT) + dstoffset); + *(dstT*)(dstptr + dst_index) = value; + } + } +} + +__kernel void cartToPolar(__global uchar* dstptr, int dststep, int dstoffset, + int rows, int cols, dstT value ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (x < cols && y < rows) + { + int dst_index = mad24(y, dststep, x*sizeof(dstT) + dstoffset); + *(dstT*)(dstptr + dst_index) = value; + } +} diff --git a/modules/core/src/opencl/reductions.cl b/modules/core/src/opencl/reductions.cl new file mode 100644 index 0000000000..6eb6e48323 --- /dev/null +++ b/modules/core/src/opencl/reductions.cl @@ -0,0 +1,104 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. 
+// +// @Authors +// Shengen Yan,yanshengen@gmail.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif +#endif + +#if FUNC_SUM +#define FUNC(a, b) b += a; +#elif FUNC_ABS_SUM +#define FUNC(a, b) b += a >= (dstT)(0) ? 
a : -a; +#elif FUNC_SQR_SUM +#define FUNC(a, b) b += a * a; +#else +#error No sum function +#endif + +/**************************************Array buffer SUM**************************************/ + +__kernel void arithm_op_sum(int cols,int invalid_cols,int offset,int elemnum,int groupnum, + __global srcT *src, __global dstT *dst) +{ + unsigned int lid = get_local_id(0); + unsigned int gid = get_group_id(0); + unsigned int id = get_global_id(0); + unsigned int idx = offset + id + (id / cols) * invalid_cols; + + __local dstT localmem_sum[128]; + dstT sum = (dstT)(0), temp; + + for (int grainSize = groupnum << 8; id < elemnum; id += grainSize) + { + idx = offset + id + (id / cols) * invalid_cols; + temp = convertToDstT(src[idx]); + FUNC(temp, sum); + } + + if (lid > 127) + localmem_sum[lid - 128] = sum; + barrier(CLK_LOCAL_MEM_FENCE); + + if (lid < 128) + localmem_sum[lid] = sum + localmem_sum[lid]; + barrier(CLK_LOCAL_MEM_FENCE); + + for (int lsize = 64; lsize > 0; lsize >>= 1) + { + if (lid < lsize) + { + int lid2 = lsize + lid; + localmem_sum[lid] = localmem_sum[lid] + localmem_sum[lid2]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (lid == 0) + dst[gid] = localmem_sum[0]; +} diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp index 073a54e034..faffb9a9d7 100644 --- a/modules/core/src/precomp.hpp +++ b/modules/core/src/precomp.hpp @@ -67,6 +67,8 @@ #define GET_OPTIMIZED(func) (func) #endif +#include "opencl_kernels.hpp" + namespace cv { @@ -205,13 +207,30 @@ enum { BLOCK_SIZE = 1024 }; inline bool checkScalar(const Mat& sc, int atype, int sckind, int akind) { - if( sc.dims > 2 || (sc.cols != 1 && sc.rows != 1) || !sc.isContinuous() ) + if( sc.dims > 2 || !sc.isContinuous() ) + return false; + Size sz = sc.size(); + if(sz.width != 1 && sz.height != 1) + return false; + int cn = CV_MAT_CN(atype); + if( akind == _InputArray::MATX && sckind != _InputArray::MATX ) + return false; + return sz == Size(1, 1) || sz == Size(1, cn) || sz == Size(cn, 1) 
|| + (sz == Size(1, 4) && sc.type() == CV_64F && cn <= 4); +} + +inline bool checkScalar(InputArray sc, int atype, int sckind, int akind) +{ + if( sc.dims() > 2 || !sc.isContinuous() ) + return false; + Size sz = sc.size(); + if(sz.width != 1 && sz.height != 1) return false; int cn = CV_MAT_CN(atype); if( akind == _InputArray::MATX && sckind != _InputArray::MATX ) return false; - return sc.size() == Size(1, 1) || sc.size() == Size(1, cn) || sc.size() == Size(cn, 1) || - (sc.size() == Size(1, 4) && sc.type() == CV_64F && cn <= 4); + return sz == Size(1, 1) || sz == Size(1, cn) || sz == Size(cn, 1) || + (sz == Size(1, 4) && sc.type() == CV_64F && cn <= 4); } void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t blocksize ); @@ -227,7 +246,10 @@ struct TLSData static TLSData* get(); }; -namespace ocl { MatAllocator* getOpenCLAllocator(); } +namespace ocl +{ + MatAllocator* getOpenCLAllocator(); +} } diff --git a/modules/core/src/umatrix.cpp b/modules/core/src/umatrix.cpp index 2ea71acc8b..46f03b97b8 100644 --- a/modules/core/src/umatrix.cpp +++ b/modules/core/src/umatrix.cpp @@ -197,6 +197,7 @@ UMat Mat::getUMat(int accessFlags) const if(!u) return hdr; UMat::getStdAllocator()->allocate(u, accessFlags); + hdr.flags = flags; setSize(hdr, dims, size.p, step.p); finalizeHdr(hdr); hdr.u = u; @@ -548,7 +549,8 @@ Mat UMat::getMat(int accessFlags) const CV_Assert(u->data != 0); Mat hdr(dims, size.p, type(), u->data + offset, step.p); hdr.u = u; - hdr.datastart = hdr.data = u->data; + hdr.datastart = u->data; + hdr.data = hdr.datastart + offset; hdr.datalimit = hdr.dataend = u->data + u->size; CV_XADD(&hdr.u->refcount, 1); return hdr; @@ -617,7 +619,7 @@ void UMat::copyTo(OutputArray _dst) const void* dsthandle = dst.handle(ACCESS_WRITE); if( srchandle == dsthandle && dst.offset == offset ) return; - ndoffset(dstofs); + dst.ndoffset(dstofs); CV_Assert(u->currAllocator == dst.u->currAllocator); u->currAllocator->copy(u, dst.u, dims, sz, srcofs, step.p, 
dstofs, dst.step.p, false); } @@ -633,6 +635,50 @@ void UMat::convertTo(OutputArray, int, double, double) const CV_Error(Error::StsNotImplemented, ""); } +UMat& UMat::setTo(InputArray _value, InputArray _mask) +{ + bool haveMask = !_mask.empty(); + int t = type(), cn = CV_MAT_CN(t); + if( dims <= 2 && cn <= 4 && ocl::useOpenCL() ) + { + Mat value = _value.getMat(); + CV_Assert( checkScalar(value, type(), _value.kind(), _InputArray::UMAT) ); + double buf[4]; + convertAndUnrollScalar(value, t, (uchar*)buf, 1); + + char opts[1024]; + sprintf(opts, "-D dstT=%s", ocl::memopTypeToStr(t)); + + ocl::Kernel setK(haveMask ? "setMask" : "set", ocl::core::copyset_oclsrc, opts); + if( !setK.empty() ) + { + ocl::KernelArg scalararg(0, 0, 0, buf, CV_ELEM_SIZE(t)); + UMat mask; + + if( haveMask ) + { + mask = _mask.getUMat(); + CV_Assert( mask.size() == size() && mask.type() == CV_8U ); + ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask); + ocl::KernelArg dstarg = ocl::KernelArg::ReadWrite(*this); + setK.args(maskarg, dstarg, scalararg); + } + else + { + ocl::KernelArg dstarg = ocl::KernelArg::WriteOnly(*this); + setK.args(dstarg, scalararg); + } + + size_t globalsize[] = { cols, rows }; + if( setK.run(2, globalsize, 0, false) ) + return *this; + } + } + Mat m = getMat(haveMask ? 
ACCESS_RW : ACCESS_WRITE); + m.setTo(_value, _mask); + return *this; +} + UMat& UMat::operator = (const Scalar&) { CV_Error(Error::StsNotImplemented, ""); diff --git a/modules/core/test/test_umat.cpp b/modules/core/test/test_umat.cpp index 56ec72c7a2..0b011209a9 100644 --- a/modules/core/test/test_umat.cpp +++ b/modules/core/test/test_umat.cpp @@ -91,11 +91,11 @@ bool CV_UMatTest::TestUMat() { try { - Mat a(100, 100, CV_16S), b; + Mat a(100, 100, CV_16SC2), b, c; randu(a, Scalar::all(-100), Scalar::all(100)); - Rect roi(1, 3, 10, 20); - Mat ra(a, roi), rb; - UMat ua, ura; + Rect roi(1, 3, 5, 4); + Mat ra(a, roi), rb, rc, rc0; + UMat ua, ura, ub, urb, uc, urc; a.copyTo(ua); ua.copyTo(b); CHECK_DIFF(a, b); @@ -112,6 +112,71 @@ bool CV_UMatTest::TestUMat() } ra.copyTo(rb); CHECK_DIFF(ra, rb); + + b = a.clone(); + ra = a(roi); + rb = b(roi); + randu(b, Scalar::all(-100), Scalar::all(100)); + b.copyTo(ub); + urb = ub(roi); + + /*std::cout << "==============================================\nbefore op (CPU):\n"; + std::cout << "ra: " << ra << std::endl; + std::cout << "rb: " << rb << std::endl;*/ + + ra.copyTo(ura); + rb.copyTo(urb); + ra.release(); + rb.release(); + ura.copyTo(ra); + urb.copyTo(rb); + + /*std::cout << "==============================================\nbefore op (GPU):\n"; + std::cout << "ra: " << ra << std::endl; + std::cout << "rb: " << rb << std::endl;*/ + + cv::max(ra, rb, rc); + cv::max(ura, urb, urc); + urc.copyTo(rc0); + + /*std::cout << "==============================================\nafter op:\n"; + std::cout << "rc: " << rc << std::endl; + std::cout << "rc0: " << rc0 << std::endl;*/ + + CHECK_DIFF(rc0, rc); + + { + UMat tmp = rc0.getUMat(ACCESS_WRITE); + cv::max(ura, urb, tmp); + } + CHECK_DIFF(rc0, rc); + + ura.copyTo(urc); + cv::max(urc, urb, urc); + urc.copyTo(rc0); + CHECK_DIFF(rc0, rc); + + rc = ra ^ rb; + cv::bitwise_xor(ura, urb, urc); + urc.copyTo(rc0); + + /*std::cout << "==============================================\nafter op:\n"; + 
std::cout << "ra: " << rc0 << std::endl; + std::cout << "rc: " << rc << std::endl;*/ + + CHECK_DIFF(rc0, rc); + + rc = ra + rb; + cv::add(ura, urb, urc); + urc.copyTo(rc0); + + CHECK_DIFF(rc0, rc); + + cv::subtract(ra, Scalar::all(5), rc); + cv::subtract(ura, Scalar::all(5), urc); + urc.copyTo(rc0); + + CHECK_DIFF(rc0, rc); } catch (const test_excep& e) { diff --git a/modules/highgui/include/opencv2/highgui.hpp b/modules/highgui/include/opencv2/highgui.hpp index c4fc73a81f..cebf8fe22a 100644 --- a/modules/highgui/include/opencv2/highgui.hpp +++ b/modules/highgui/include/opencv2/highgui.hpp @@ -511,9 +511,10 @@ public: CV_WRAP virtual void release(); CV_WRAP virtual bool grab(); - CV_WRAP virtual bool retrieve(CV_OUT Mat& image, int flag = 0); + CV_WRAP virtual bool retrieve(OutputArray image, int flag = 0); virtual VideoCapture& operator >> (CV_OUT Mat& image); - CV_WRAP virtual bool read(CV_OUT Mat& image); + virtual VideoCapture& operator >> (CV_OUT UMat& image); + CV_WRAP virtual bool read(OutputArray image); CV_WRAP virtual bool set(int propId, double value); CV_WRAP virtual double get(int propId); diff --git a/modules/highgui/src/cap.cpp b/modules/highgui/src/cap.cpp index 04da481d01..0f4e6afb89 100644 --- a/modules/highgui/src/cap.cpp +++ b/modules/highgui/src/cap.cpp @@ -515,7 +515,7 @@ bool VideoCapture::grab() return cvGrabFrame(cap) != 0; } -bool VideoCapture::retrieve(Mat& image, int channel) +bool VideoCapture::retrieve(OutputArray image, int channel) { IplImage* _img = cvRetrieveFrame(cap, channel); if( !_img ) @@ -533,7 +533,7 @@ bool VideoCapture::retrieve(Mat& image, int channel) return true; } -bool VideoCapture::read(Mat& image) +bool VideoCapture::read(OutputArray image) { if(grab()) retrieve(image); @@ -548,6 +548,12 @@ VideoCapture& VideoCapture::operator >> (Mat& image) return *this; } +VideoCapture& VideoCapture::operator >> (UMat& image) +{ + read(image); + return *this; +} + bool VideoCapture::set(int propId, double value) { return 
cvSetCaptureProperty(cap, propId, value) != 0; diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 49312ba09b..b49d8db37a 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -2687,6 +2687,124 @@ struct mRGBA2RGBA } }; + +static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) +{ + bool ok = true; + UMat src = _src.getUMat(), dst; + Size sz = src.size(), dstSz = sz; + int scn = src.channels(), depth = src.depth(), bidx, dtype; + size_t globalsize[] = { src.cols, src.rows }; + ocl::Kernel k; + + if(depth != CV_8U && depth != CV_16U && depth != CV_32F) + return false; + + switch (code) + { + /* + case COLOR_BGR2BGRA: case COLOR_RGB2BGRA: case COLOR_BGRA2BGR: + case COLOR_RGBA2BGR: case COLOR_RGB2BGR: case COLOR_BGRA2RGBA: + case COLOR_BGR2BGR565: case COLOR_BGR2BGR555: case COLOR_RGB2BGR565: case COLOR_RGB2BGR555: + case COLOR_BGRA2BGR565: case COLOR_BGRA2BGR555: case COLOR_RGBA2BGR565: case COLOR_RGBA2BGR555: + case COLOR_BGR5652BGR: case COLOR_BGR5552BGR: case COLOR_BGR5652RGB: case COLOR_BGR5552RGB: + case COLOR_BGR5652BGRA: case COLOR_BGR5552BGRA: case COLOR_BGR5652RGBA: case COLOR_BGR5552RGBA: + */ + case COLOR_BGR2GRAY: + case COLOR_BGRA2GRAY: + case COLOR_RGB2GRAY: + case COLOR_RGBA2GRAY: + { + CV_Assert(scn == 3 || scn == 4); + bidx = code == COLOR_BGR2GRAY || code == COLOR_BGRA2GRAY ? 0 : 2; + dtype = depth; + k.create("RGB2Gray", ocl::imgproc::cvtcolor_oclsrc, + format("-D depth=%d -D scn=%d -D dcn=1 -D bidx=%d", depth, scn, bidx)); + break; + } + case COLOR_GRAY2BGR: + case COLOR_GRAY2BGRA: + { + CV_Assert(scn == 1); + dcn = code == COLOR_GRAY2BGRA ? 4 : 3; + dtype = CV_MAKETYPE(depth, dcn); + k.create("Gray2RGB", ocl::imgproc::cvtcolor_oclsrc, + format("-D depth=%d -D scn=1 -D dcn=%d", depth, dcn)); + break; + } + case COLOR_BGR2YUV: + case COLOR_RGB2YUV: + { + CV_Assert(scn == 3 || scn == 4); + bidx = code == COLOR_RGB2YUV ? 
0 : 2; + k.create("RGB2YUV", ocl::imgproc::cvtcolor_oclsrc, + format("-D depth=%d -D scn=%d -D dcn=3 -D bidx=%d", depth, scn, bidx)); + break; + } + case COLOR_YUV2BGR: + case COLOR_YUV2RGB: + { + if(dcn < 0) dcn = 3; + CV_Assert(dcn == 3 || dcn == 4); + bidx = code == COLOR_YUV2RGB ? 0 : 2; + k.create("YUV2RGB", ocl::imgproc::cvtcolor_oclsrc, + format("-D depth=%d -D scn=3 -D dcn=%d -D bidx=%d", depth, dcn, bidx)); + break; + } + case COLOR_YUV2RGB_NV12: + case COLOR_YUV2BGR_NV12: + case COLOR_YUV2RGBA_NV12: + case COLOR_YUV2BGRA_NV12: + { + CV_Assert( scn == 1 ); + CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U ); + dcn = code == COLOR_YUV2BGRA_NV12 || code == COLOR_YUV2RGBA_NV12 ? 4 : 3; + bidx = code == COLOR_YUV2BGRA_NV12 || code == COLOR_YUV2BGR_NV12 ? 0 : 2; + + dstSz = Size(sz.width, sz.height * 2 / 3); + globalsize[0] = dstSz.height/2; + globalsize[1] = dstSz.width/2; + k.create("YUV2RGBA_NV12", ocl::imgproc::cvtcolor_oclsrc, + format("-D depth=0 -D scn=1 -D dcn=%d -D bidx=%d", dcn, bidx)); + break; + } + case COLOR_BGR2YCrCb: + case COLOR_RGB2YCrCb: + { + CV_Assert(scn == 3 || scn == 4); + bidx = code == COLOR_BGR2YCrCb ? 
0 : 2; + k.create("RGB2YCrCb", ocl::imgproc::cvtcolor_oclsrc, + format("-D depth=%d -D scn=%d -D dcn=3 -D bidx=%d", depth, scn, bidx)); + break; + } + case COLOR_YCrCb2BGR: + case COLOR_YCrCb2RGB: + break; + /* + case COLOR_BGR5652GRAY: case COLOR_BGR5552GRAY: + case COLOR_GRAY2BGR565: case COLOR_GRAY2BGR555: + case COLOR_BGR2YCrCb: case COLOR_RGB2YCrCb: + case COLOR_BGR2XYZ: case COLOR_RGB2XYZ: + case COLOR_XYZ2BGR: case COLOR_XYZ2RGB: + case COLOR_BGR2HSV: case COLOR_RGB2HSV: case COLOR_BGR2HSV_FULL: case COLOR_RGB2HSV_FULL: + case COLOR_BGR2HLS: case COLOR_RGB2HLS: case COLOR_BGR2HLS_FULL: case COLOR_RGB2HLS_FULL: + case COLOR_HSV2BGR: case COLOR_HSV2RGB: case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL: + case COLOR_HLS2BGR: case COLOR_HLS2RGB: case COLOR_HLS2BGR_FULL: case COLOR_HLS2RGB_FULL: + */ + default: + ; + } + + if( !k.empty() ) + { + _dst.create(dstSz, dtype); + dst = _dst.getUMat(); + k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst)); + ok = k.run(2, globalsize, 0, false); + } + return ok; +} + }//namespace cv ////////////////////////////////////////////////////////////////////////////////////////// @@ -2695,9 +2813,15 @@ struct mRGBA2RGBA void cv::cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) { + bool use_opencl = ocl::useOpenCL() && _dst.kind() == _InputArray::UMAT; + int stype = _src.type(); + int scn = CV_MAT_CN(stype), depth = CV_MAT_DEPTH(stype), bidx; + + if( use_opencl && ocl_cvtColor(_src, _dst, code, dcn) ) + return; + Mat src = _src.getMat(), dst; Size sz = src.size(); - int scn = src.channels(), depth = src.depth(), bidx; CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_32F ); diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index cf8c43cf3e..161b3332a1 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -1901,8 +1901,43 @@ private: }; #endif +static bool ocl_resize( InputArray _src, OutputArray _dst, + double fx, double 
fy, int interpolation) +{ + int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + if( !(cn <= 4 && + (interpolation == INTER_NEAREST || + (interpolation == INTER_LINEAR && (depth == CV_8U || depth == CV_32F)))) ) + return false; + UMat src = _src.getUMat(), dst = _dst.getUMat(); + ocl::Kernel k; + + if (interpolation == INTER_LINEAR) + { + int wdepth = depth == CV_8U ? CV_32S : CV_32F; + int wtype = CV_MAKETYPE(wdepth, cn); + char buf[2][32]; + k.create("resizeLN", ocl::imgproc::resize_oclsrc, + format("-D INTER_LINEAR -D depth=%s -D PIXTYPE=%s -D WORKTYPE=%s -D convertToWT=%s -D convertToDT=%s", + depth, ocl::typeToStr(type), ocl::typeToStr(wtype), + ocl::convertTypeStr(depth, wdepth, cn, buf[0]), + ocl::convertTypeStr(wdepth, depth, cn, buf[1]))); + } + else if (interpolation == INTER_NEAREST) + { + k.create("resizeNN", ocl::imgproc::resize_oclsrc, + format("-D INTER_NEAREST -D PIXTYPE=%s", ocl::memopTypeToStr(type) )); + } + + if( k.empty() ) + return false; + k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), + (float)(1./fx), (float)(1./fy)); + size_t globalsize[] = { dst.cols, dst.rows }; + return k.run(2, globalsize, 0, false); } +} ////////////////////////////////////////////////////////////////////////////////////////// @@ -2013,25 +2048,28 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize, resizeArea_, 0 }; - Mat src = _src.getMat(); - Size ssize = src.size(); + Size ssize = _src.size(); CV_Assert( ssize.area() > 0 ); CV_Assert( dsize.area() || (inv_scale_x > 0 && inv_scale_y > 0) ); if( !dsize.area() ) { - dsize = Size(saturate_cast(src.cols*inv_scale_x), - saturate_cast(src.rows*inv_scale_y)); + dsize = Size(saturate_cast(ssize.width*inv_scale_x), + saturate_cast(ssize.height*inv_scale_y)); CV_Assert( dsize.area() ); } else { - inv_scale_x = (double)dsize.width/src.cols; - inv_scale_y = (double)dsize.height/src.rows; + inv_scale_x = (double)dsize.width/ssize.width; + inv_scale_y = 
(double)dsize.height/ssize.height; } - _dst.create(dsize, src.type()); - Mat dst = _dst.getMat(); + _dst.create(dsize, _src.type()); + if( ocl::useOpenCL() && _dst.kind() == _InputArray::UMAT && + ocl_resize(_src, _dst, inv_scale_x, inv_scale_y, interpolation) ) + return; + + Mat src = _src.getMat(), dst = _dst.getMat(); #ifdef HAVE_TEGRA_OPTIMIZATION if (tegra::resize(src, dst, (float)inv_scale_x, (float)inv_scale_y, interpolation)) diff --git a/modules/imgproc/src/opencl/bilateral.cl b/modules/imgproc/src/opencl/bilateral.cl new file mode 100644 index 0000000000..cb317a0057 --- /dev/null +++ b/modules/imgproc/src/opencl/bilateral.cl @@ -0,0 +1,145 @@ +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Rock Li, Rock.li@amd.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. + +__kernel void bilateral_C1_D0(__global uchar *dst, + __global const uchar *src, + const int dst_rows, + const int dst_cols, + const int maxk, + const int radius, + const int dst_step, + const int dst_offset, + const int src_step, + const int src_rows, + const int src_cols, + __constant float *color_weight, + __constant float *space_weight, + __constant int *space_ofs) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (y < dst_rows && x < dst_cols) + { + int src_index = mad24(y + radius, src_step, x + radius); + int dst_index = mad24(y, dst_step, x + dst_offset); + float sum = 0.f, wsum = 0.f; + + int val0 = (int)src[src_index]; + for(int k = 0; k < maxk; k++ ) + { + int val = (int)src[src_index + space_ofs[k]]; + float w = space_weight[k] * color_weight[abs(val - val0)]; + sum += (float)(val) * w; + wsum += w; + } + dst[dst_index] = convert_uchar_rtz(sum / wsum + 0.5f); + } +} + +__kernel void bilateral2_C1_D0(__global uchar *dst, + __global const uchar *src, + const int dst_rows, + const int dst_cols, + const int maxk, + const int radius, + const int dst_step, + const int dst_offset, + const int src_step, + const int src_rows, + const int src_cols, + __constant float *color_weight, + __constant float *space_weight, + __constant int *space_ofs) +{ + int x = get_global_id(0) << 2; + int y = get_global_id(1); + + if (y < dst_rows && x < dst_cols) + { + int src_index = mad24(y + radius, src_step, x + radius); + int 
dst_index = mad24(y, dst_step, x + dst_offset); + float4 sum = (float4)(0.f), wsum = (float4)(0.f); + + int4 val0 = convert_int4(vload4(0,src + src_index)); + for(int k = 0; k < maxk; k++ ) + { + int4 val = convert_int4(vload4(0,src+src_index + space_ofs[k])); + float4 w = (float4)(space_weight[k]) * (float4)(color_weight[abs(val.x - val0.x)], color_weight[abs(val.y - val0.y)], + color_weight[abs(val.z - val0.z)], color_weight[abs(val.w - val0.w)]); + sum += convert_float4(val) * w; + wsum += w; + } + *(__global uchar4*)(dst+dst_index) = convert_uchar4_rtz(sum/wsum+0.5f); + } +} + +__kernel void bilateral_C4_D0(__global uchar4 *dst, + __global const uchar4 *src, + const int dst_rows, + const int dst_cols, + const int maxk, + const int radius, + const int dst_step, + const int dst_offset, + const int src_step, + const int src_rows, + const int src_cols, + __constant float *color_weight, + __constant float *space_weight, + __constant int *space_ofs) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (y < dst_rows && x < dst_cols) + { + int src_index = mad24(y + radius, src_step, x + radius); + int dst_index = mad24(y, dst_step, x + dst_offset); + float4 sum = (float4)0.f; + float wsum = 0.f; + + int4 val0 = convert_int4(src[src_index]); + for(int k = 0; k < maxk; k++ ) + { + int4 val = convert_int4(src[src_index + space_ofs[k]]); + float w = space_weight[k] * color_weight[abs(val.x - val0.x) + abs(val.y - val0.y) + abs(val.z - val0.z)]; + sum += convert_float4(val) * (float4)w; + wsum += w; + } + + wsum = 1.f / wsum; + dst[dst_index] = convert_uchar4_rtz(sum * (float4)wsum + (float4)0.5f); + } +} diff --git a/modules/imgproc/src/opencl/boxfilter.cl b/modules/imgproc/src/opencl/boxfilter.cl new file mode 100644 index 0000000000..030c13cc57 --- /dev/null +++ b/modules/imgproc/src/opencl/boxfilter.cl @@ -0,0 +1,478 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, 
COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Zhang Ying, zhangying913@gmail.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////Macro for border type//////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////////////// +#ifdef BORDER_REPLICATE +//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i)) +#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr)) +#endif + +#ifdef BORDER_REFLECT +//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i)) +#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr)) +#endif + +#ifdef BORDER_REFLECT_101 +//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i)) +#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? 
-(i)-2+((b_edge)<<1) : (addr)) +#endif + +//blur function does not support BORDER_WRAP +#ifdef BORDER_WRAP +//BORDER_WRAP: cdefgh|abcdefgh|abcdefg +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i)) +#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr)) +#endif + +#define THREADS 256 +#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2) + +inline void update_dst_C1_D0(__global uchar *dst, __local uint* temp, + int dst_rows, int dst_cols, + int dst_startX, int dst_x_off, + float alpha) +{ + if(get_local_id(0) < anX || get_local_id(0) >= (THREADS-ksX+anX+1)) + { + return; + } + + uint4 tmp_sum = 0; + int posX = dst_startX - dst_x_off + (get_local_id(0)-anX)*4; + int posY = (get_group_id(1) << 1); + + for(int i=-anX; i<=anX; i++) + { + tmp_sum += vload4(get_local_id(0), temp+i); + } + + if(posY < dst_rows && posX < dst_cols) + { + tmp_sum /= (uint4) alpha; + if(posX >= 0 && posX < dst_cols) + *(dst) = tmp_sum.x; + if(posX+1 >= 0 && posX+1 < dst_cols) + *(dst + 1) = tmp_sum.y; + if(posX+2 >= 0 && posX+2 < dst_cols) + *(dst + 2) = tmp_sum.z; + if(posX+3 >= 0 && posX+3 < dst_cols) + *(dst + 3) = tmp_sum.w; + } +} + + +inline void update_dst_C4_D0(__global uchar4 *dst, __local uint4* temp, + int dst_rows, int dst_cols, + int dst_startX, int dst_x_off, + float alpha) +{ + if(get_local_id(0) >= (THREADS-ksX+1)) + { + return; + } + + int posX = dst_startX - dst_x_off + get_local_id(0); + int posY = (get_group_id(1) << 1); + + uint4 temp_sum = 0; + for(int i=-anX; i<=anX; i++) + { + temp_sum += temp[get_local_id(0) + anX + i]; + } + + if(posX >= 0 && posX < dst_cols && posY >= 0 && posY < dst_rows) + *dst = convert_uchar4(convert_float4(temp_sum)/alpha); +} + 
+/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////8uC1//////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////// +__kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global uchar *dst, float alpha, + int src_offset, int src_whole_rows, int src_whole_cols, int src_step, + int dst_offset, int dst_rows, int dst_cols, int dst_step + ) +{ + + int col = get_local_id(0); + const int gX = get_group_id(0); + const int gY = get_group_id(1); + int src_x_off = src_offset % src_step; + int src_y_off = src_offset / src_step; + int dst_x_off = dst_offset % dst_step; + int dst_y_off = dst_offset / dst_step; + + int head_off = dst_x_off%4; + int startX = ((gX * (THREADS-ksX+1)-anX) * 4) - head_off + src_x_off; + int startY = (gY << 1) - anY + src_y_off; + int dst_startX = (gX * (THREADS-ksX+1) * 4) - head_off + dst_x_off; + int dst_startY = (gY << 1) + dst_y_off; + + uint4 data[ksY+1]; + __local uint4 temp[2][THREADS]; + +#ifdef BORDER_CONSTANT + + for(int i=0; i < ksY+1; i++) + { + if(startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4+3=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4=0 && startY+i < src_whole_rows && startX+col*4+1 >=0 && startX+col*4+1=0 && startY+i < src_whole_rows && startX+col*4+2 >=0 && startX+col*4+2=0 && startY+i < src_whole_rows && startX+col*4+3 >=0 && startX+col*4+3src_whole_cols-1) + | (startY+i<0) | (startY+i>src_whole_rows-1); + if(not_all_in_range) + { + int selected_row; + int4 selected_col; + selected_row = ADDR_H(startY+i, 0, src_whole_rows); + selected_row = ADDR_B(startY+i, src_whole_rows, selected_row); + + selected_col.x = ADDR_L(startX+col*4, 0, src_whole_cols); + selected_col.x = ADDR_R(startX+col*4, src_whole_cols, selected_col.x); + + selected_col.y = ADDR_L(startX+col*4+1, 0, 
src_whole_cols); + selected_col.y = ADDR_R(startX+col*4+1, src_whole_cols, selected_col.y); + + selected_col.z = ADDR_L(startX+col*4+2, 0, src_whole_cols); + selected_col.z = ADDR_R(startX+col*4+2, src_whole_cols, selected_col.z); + + selected_col.w = ADDR_L(startX+col*4+3, 0, src_whole_cols); + selected_col.w = ADDR_R(startX+col*4+3, src_whole_cols, selected_col.w); + + data[i].x = *(src + selected_row * src_step + selected_col.x); + data[i].y = *(src + selected_row * src_step + selected_col.y); + data[i].z = *(src + selected_row * src_step + selected_col.z); + data[i].w = *(src + selected_row * src_step + selected_col.w); + } + else + { + data[i] = convert_uint4(vload4(col,(__global uchar*)(src+(startY+i)*src_step + startX))); + } + } +#endif + uint4 tmp_sum = 0; + for(int i=1; i < ksY; i++) + { + tmp_sum += (data[i]); + } + + int index = dst_startY * dst_step + dst_startX + (col-anX)*4; + + temp[0][col] = tmp_sum + (data[0]); + temp[1][col] = tmp_sum + (data[ksY]); + barrier(CLK_LOCAL_MEM_FENCE); + update_dst_C1_D0(dst+index, (__local uint *)(temp[0]), + dst_rows, dst_cols, dst_startX, dst_x_off, alpha); + update_dst_C1_D0(dst+index+dst_step, (__local uint *)(temp[1]), + dst_rows, dst_cols, dst_startX, dst_x_off, alpha); + +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////8uC4//////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////// +__kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uchar4 *dst, float alpha, + int src_offset, int src_whole_rows, int src_whole_cols, int src_step, + int dst_offset, int dst_rows, int dst_cols, int dst_step + ) +{ + int col = get_local_id(0); + const int gX = get_group_id(0); + const int gY = get_group_id(1); + + int src_x_off = (src_offset % src_step) >> 2; + int src_y_off = src_offset / src_step; + int 
dst_x_off = (dst_offset % dst_step) >> 2; + int dst_y_off = dst_offset / dst_step; + + int startX = gX * (THREADS-ksX+1) - anX + src_x_off; + int startY = (gY << 1) - anY + src_y_off; + int dst_startX = gX * (THREADS-ksX+1) + dst_x_off; + int dst_startY = (gY << 1) + dst_y_off; + + uint4 data[ksY+1]; + __local uint4 temp[2][THREADS]; + +#ifdef BORDER_CONSTANT + bool con; + for(int i=0; i < ksY+1; i++) + { + con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows; + int cur_col = clamp(startX + col, 0, src_whole_cols); + + data[i].x = con ? src[(startY+i)*(src_step>>2) + cur_col].x : 0; + data[i].y = con ? src[(startY+i)*(src_step>>2) + cur_col].y : 0; + data[i].z = con ? src[(startY+i)*(src_step>>2) + cur_col].z : 0; + data[i].w = con ? src[(startY+i)*(src_step>>2) + cur_col].w : 0; + } +#else + for(int i=0; i < ksY+1; i++) + { + int selected_row; + int selected_col; + selected_row = ADDR_H(startY+i, 0, src_whole_rows); + selected_row = ADDR_B(startY+i, src_whole_rows, selected_row); + + selected_col = ADDR_L(startX+col, 0, src_whole_cols); + selected_col = ADDR_R(startX+col, src_whole_cols, selected_col); + + + data[i] = convert_uint4(src[selected_row * (src_step>>2) + selected_col]); + } + +#endif + uint4 tmp_sum = 0; + for(int i=1; i < ksY; i++) + { + tmp_sum += (data[i]); + } + + int index = dst_startY * (dst_step>>2)+ dst_startX + col; + + temp[0][col] = tmp_sum + (data[0]); + temp[1][col] = tmp_sum + (data[ksY]); + barrier(CLK_LOCAL_MEM_FENCE); + update_dst_C4_D0(dst+index, (__local uint4 *)(temp[0]), + dst_rows, dst_cols, dst_startX, dst_x_off, alpha); + update_dst_C4_D0(dst+index+(dst_step>>2), (__local uint4 *)(temp[1]), + dst_rows, dst_cols, dst_startX, dst_x_off, alpha); + +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////32fC1//////////////////////////////////////////////////////// 
+//////////////////////////////////////////////////////////////////////////////////////////////////// +__kernel void boxFilter_C1_D5(__global const float *restrict src, __global float *dst, float alpha, + int src_offset, int src_whole_rows, int src_whole_cols, int src_step, + int dst_offset, int dst_rows, int dst_cols, int dst_step + ) +{ + int col = get_local_id(0); + const int gX = get_group_id(0); + const int gY = get_group_id(1); + + int src_x_off = (src_offset % src_step) >> 2; + int src_y_off = src_offset / src_step; + int dst_x_off = (dst_offset % dst_step) >> 2; + int dst_y_off = dst_offset / dst_step; + + int startX = gX * (THREADS-ksX+1) - anX + src_x_off; + int startY = (gY << 1) - anY + src_y_off; + int dst_startX = gX * (THREADS-ksX+1) + dst_x_off; + int dst_startY = (gY << 1) + dst_y_off; + float data[ksY+1]; + __local float temp[2][THREADS]; +#ifdef BORDER_CONSTANT + bool con; + float ss; + for(int i=0; i < ksY+1; i++) + { + con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows; + + int cur_col = clamp(startX + col, 0, src_whole_cols); + ss = (startY+i)=0&&cur_col>=0&&cur_col>2) + cur_col]:(float)0; + + data[i] = con ? 
ss : 0.f; + } +#else + for(int i=0; i < ksY+1; i++) + { + int selected_row; + int selected_col; + selected_row = ADDR_H(startY+i, 0, src_whole_rows); + selected_row = ADDR_B(startY+i, src_whole_rows, selected_row); + + selected_col = ADDR_L(startX+col, 0, src_whole_cols); + selected_col = ADDR_R(startX+col, src_whole_cols, selected_col); + + data[i] = src[selected_row * (src_step>>2) + selected_col]; + } + +#endif + float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0; + for(int i=1; i < ksY; i++) + { + sum0 += (data[i]); + } + sum1 = sum0 + (data[0]); + sum2 = sum0 + (data[ksY]); + temp[0][col] = sum1; + temp[1][col] = sum2; + barrier(CLK_LOCAL_MEM_FENCE); + if(col < (THREADS-(ksX-1))) + { + col += anX; + int posX = dst_startX - dst_x_off + col - anX; + int posY = (gY << 1); + + float tmp_sum[2]= {0.0, 0.0}; + for(int k=0; k<2; k++) + for(int i=-anX; i<=anX; i++) + { + tmp_sum[k] += temp[k][col+i]; + } + for(int i=0; i<2; i++) + { + if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows) + dst[(dst_startY+i) * (dst_step>>2)+ dst_startX + col - anX] = tmp_sum[i]/alpha; + } + + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////32fC4//////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////// +__kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global float4 *dst, float alpha, + int src_offset, int src_whole_rows, int src_whole_cols, int src_step, + int dst_offset, int dst_rows, int dst_cols, int dst_step + ) +{ + int col = get_local_id(0); + const int gX = get_group_id(0); + const int gY = get_group_id(1); + + int src_x_off = (src_offset % src_step) >> 4; + int src_y_off = src_offset / src_step; + int dst_x_off = (dst_offset % dst_step) >> 4; + int dst_y_off = dst_offset / dst_step; + + int startX = gX * (THREADS-ksX+1) - anX + src_x_off; + int 
startY = (gY << 1) - anY + src_y_off; + int dst_startX = gX * (THREADS-ksX+1) + dst_x_off; + int dst_startY = (gY << 1) + dst_y_off; + float4 data[ksY+1]; + __local float4 temp[2][THREADS]; +#ifdef BORDER_CONSTANT + bool con; + float4 ss; + for(int i=0; i < ksY+1; i++) + { + con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows; + + int cur_col = clamp(startX + col, 0, src_whole_cols); + ss = (startY+i)=0&&cur_col>=0&&cur_col>4) + cur_col]:(float4)0; + + data[i] = con ? ss : (float4)(0.0,0.0,0.0,0.0); + } +#else + for(int i=0; i < ksY+1; i++) + { + int selected_row; + int selected_col; + selected_row = ADDR_H(startY+i, 0, src_whole_rows); + selected_row = ADDR_B(startY+i, src_whole_rows, selected_row); + + selected_col = ADDR_L(startX+col, 0, src_whole_cols); + selected_col = ADDR_R(startX+col, src_whole_cols, selected_col); + + data[i] = src[selected_row * (src_step>>4) + selected_col]; + } + +#endif + float4 sum0 = 0.0, sum1 = 0.0, sum2 = 0.0; + for(int i=1; i < ksY; i++) + { + sum0 += (data[i]); + } + sum1 = sum0 + (data[0]); + sum2 = sum0 + (data[ksY]); + temp[0][col] = sum1; + temp[1][col] = sum2; + barrier(CLK_LOCAL_MEM_FENCE); + if(col < (THREADS-(ksX-1))) + { + col += anX; + int posX = dst_startX - dst_x_off + col - anX; + int posY = (gY << 1); + + float4 tmp_sum[2]= {(float4)(0.0,0.0,0.0,0.0), (float4)(0.0,0.0,0.0,0.0)}; + for(int k=0; k<2; k++) + for(int i=-anX; i<=anX; i++) + { + tmp_sum[k] += temp[k][col+i]; + } + for(int i=0; i<2; i++) + { + if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows) + dst[(dst_startY+i) * (dst_step>>4)+ dst_startX + col - anX] = tmp_sum[i]/alpha; + } + + } +} diff --git a/modules/imgproc/src/opencl/canny.cl b/modules/imgproc/src/opencl/canny.cl new file mode 100644 index 0000000000..ca670b6db7 --- /dev/null +++ b/modules/imgproc/src/opencl/canny.cl @@ -0,0 +1,636 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// 
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Peng Xiao, pengxiao@multicorewareinc.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable + +#ifdef L2GRAD +inline float calc(int x, int y) +{ + return sqrt((float)(x * x + y * y)); +} +#else +inline float calc(int x, int y) +{ + return (float)abs(x) + abs(y); +} +#endif // + +// Smoothing perpendicular to the derivative direction with a triangle filter +// only support 3x3 Sobel kernel +// h (-1) = 1, h (0) = 2, h (1) = 1 +// h'(-1) = -1, h'(0) = 0, h'(1) = 1 +// thus sobel 2D operator can be calculated as: +// h'(x, y) = h'(x)h(y) for x direction +// +// src input 8bit single channel image data +// dx_buf output dx buffer +// dy_buf output dy buffer +__kernel +void +__attribute__((reqd_work_group_size(16,16,1))) +calcSobelRowPass +( + __global const uchar * src, + __global int * dx_buf, + __global int * dy_buf, + int rows, + int cols, + int src_step, + int src_offset, + int dx_buf_step, + int dx_buf_offset, + int dy_buf_step, + int dy_buf_offset +) +{ + dx_buf_step /= sizeof(*dx_buf); + dx_buf_offset /= sizeof(*dx_buf); + dy_buf_step /= sizeof(*dy_buf); + dy_buf_offset /= sizeof(*dy_buf); + + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + int lidx = get_local_id(0); + int lidy = get_local_id(1); + + __local int smem[16][18]; + + smem[lidy][lidx + 1] = + src[gidx + min(gidy, rows - 1) * src_step + src_offset]; + if(lidx == 0) + { + smem[lidy][0] = + 
src[max(gidx - 1, 0) + min(gidy, rows - 1) * src_step + src_offset]; + smem[lidy][17] = + src[min(gidx + 16, cols - 1) + min(gidy, rows - 1) * src_step + src_offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(gidy < rows && gidx < cols) + { + dx_buf[gidx + gidy * dx_buf_step + dx_buf_offset] = + -smem[lidy][lidx] + smem[lidy][lidx + 2]; + dy_buf[gidx + gidy * dy_buf_step + dy_buf_offset] = + smem[lidy][lidx] + 2 * smem[lidy][lidx + 1] + smem[lidy][lidx + 2]; + } +} + +// calculate the magnitude of the filter pass combining both x and y directions +// This is the buffered version(3x3 sobel) +// +// dx_buf dx buffer, calculated from calcSobelRowPass +// dy_buf dy buffer, calculated from calcSobelRowPass +// dx direvitive in x direction output +// dy direvitive in y direction output +// mag magnitude direvitive of xy output +__kernel +void +__attribute__((reqd_work_group_size(16,16,1))) +calcMagnitude_buf +( + __global const int * dx_buf, + __global const int * dy_buf, + __global int * dx, + __global int * dy, + __global float * mag, + int rows, + int cols, + int dx_buf_step, + int dx_buf_offset, + int dy_buf_step, + int dy_buf_offset, + int dx_step, + int dx_offset, + int dy_step, + int dy_offset, + int mag_step, + int mag_offset +) +{ + dx_buf_step /= sizeof(*dx_buf); + dx_buf_offset /= sizeof(*dx_buf); + dy_buf_step /= sizeof(*dy_buf); + dy_buf_offset /= sizeof(*dy_buf); + dx_step /= sizeof(*dx); + dx_offset /= sizeof(*dx); + dy_step /= sizeof(*dy); + dy_offset /= sizeof(*dy); + mag_step /= sizeof(*mag); + mag_offset /= sizeof(*mag); + + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + int lidx = get_local_id(0); + int lidy = get_local_id(1); + + __local int sdx[18][16]; + __local int sdy[18][16]; + + sdx[lidy + 1][lidx] = + dx_buf[gidx + min(gidy, rows - 1) * dx_buf_step + dx_buf_offset]; + sdy[lidy + 1][lidx] = + dy_buf[gidx + min(gidy, rows - 1) * dy_buf_step + dy_buf_offset]; + if(lidy == 0) + { + sdx[0][lidx] = + dx_buf[gidx + 
min(max(gidy-1,0),rows-1) * dx_buf_step + dx_buf_offset]; + sdx[17][lidx] = + dx_buf[gidx + min(gidy + 16, rows - 1) * dx_buf_step + dx_buf_offset]; + + sdy[0][lidx] = + dy_buf[gidx + min(max(gidy-1,0),rows-1) * dy_buf_step + dy_buf_offset]; + sdy[17][lidx] = + dy_buf[gidx + min(gidy + 16, rows - 1) * dy_buf_step + dy_buf_offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(gidx < cols && gidy < rows) + { + int x = sdx[lidy][lidx] + 2 * sdx[lidy + 1][lidx] + sdx[lidy + 2][lidx]; + int y = -sdy[lidy][lidx] + sdy[lidy + 2][lidx]; + + dx[gidx + gidy * dx_step + dx_offset] = x; + dy[gidx + gidy * dy_step + dy_offset] = y; + + mag[(gidx + 1) + (gidy + 1) * mag_step + mag_offset] = calc(x, y); + } +} + +// calculate the magnitude of the filter pass combining both x and y directions +// This is the non-buffered version(non-3x3 sobel) +// +// dx_buf dx buffer, calculated from calcSobelRowPass +// dy_buf dy buffer, calculated from calcSobelRowPass +// dx direvitive in x direction output +// dy direvitive in y direction output +// mag magnitude direvitive of xy output +__kernel +void calcMagnitude +( + __global const int * dx, + __global const int * dy, + __global float * mag, + int rows, + int cols, + int dx_step, + int dx_offset, + int dy_step, + int dy_offset, + int mag_step, + int mag_offset +) +{ + dx_step /= sizeof(*dx); + dx_offset /= sizeof(*dx); + dy_step /= sizeof(*dy); + dy_offset /= sizeof(*dy); + mag_step /= sizeof(*mag); + mag_offset /= sizeof(*mag); + + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + if(gidy < rows && gidx < cols) + { + mag[(gidx + 1) + (gidy + 1) * mag_step + mag_offset] = + calc( + dx[gidx + gidy * dx_step + dx_offset], + dy[gidx + gidy * dy_step + dy_offset] + ); + } +} + +////////////////////////////////////////////////////////////////////////////////////////// +// 0.4142135623730950488016887242097 is tan(22.5) +#define CANNY_SHIFT 15 +#define TG22 (int)(0.4142135623730950488016887242097*(1< low_thresh) + { + const int tg22x 
= x * TG22; + const int tg67x = tg22x + (x << (1 + CANNY_SHIFT)); + y <<= CANNY_SHIFT; + if(y < tg22x) + { + if(m > smem[lidy + 1][lidx] && m >= smem[lidy + 1][lidx + 2]) + { + edge_type = 1 + (int)(m > high_thresh); + } + } + else if (y > tg67x) + { + if(m > smem[lidy][lidx + 1]&& m >= smem[lidy + 2][lidx + 1]) + { + edge_type = 1 + (int)(m > high_thresh); + } + } + else + { + if(m > smem[lidy][lidx + 1 - s]&& m > smem[lidy + 2][lidx + 1 + s]) + { + edge_type = 1 + (int)(m > high_thresh); + } + } + } + map[gidx + 1 + (gidy + 1) * map_step] = edge_type; + } +} + +#undef CANNY_SHIFT +#undef TG22 + +////////////////////////////////////////////////////////////////////////////////////////// +// do Hysteresis for pixel whose edge type is 1 +// +// If candidate pixel (edge type is 1) has a neighbour pixel (in 3x3 area) with type 2, it is believed to be part of an edge and +// marked as edge. Each thread will iterate for 16 times to connect local edges. +// Candidate pixel being identified as edge will then be tested if there is nearby potiential edge points. If there is, counter will +// be incremented by 1 and the point location is stored. These potiential candidates will be processed further in next kernel. +// +// map raw edge type results calculated from calcMap. 
+// st the potiential edge points found in this kernel call +// counter the number of potiential edge points +__kernel +void +__attribute__((reqd_work_group_size(16,16,1))) +edgesHysteresisLocal +( + __global int * map, + __global ushort2 * st, + __global unsigned int * counter, + int rows, + int cols, + int map_step, + int map_offset +) +{ + map_step /= sizeof(*map); + map_offset /= sizeof(*map); + + map += map_offset; + + __local int smem[18][18]; + + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + int lidx = get_local_id(0); + int lidy = get_local_id(1); + + int grp_idx = get_global_id(0) & 0xFFFFF0; + int grp_idy = get_global_id(1) & 0xFFFFF0; + + int tid = lidx + lidy * 16; + int lx = tid % 18; + int ly = tid / 18; + if(ly < 14) + { + smem[ly][lx] = + map[grp_idx + lx + min(grp_idy + ly, rows - 1) * map_step]; + } + if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols) + { + smem[ly + 14][lx] = + map[grp_idx + lx + min(grp_idy + ly + 14, rows - 1) * map_step]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if(gidy < rows && gidx < cols) + { + int n; + + #pragma unroll + for (int k = 0; k < 16; ++k) + { + n = 0; + + if (smem[lidy + 1][lidx + 1] == 1) + { + n += smem[lidy ][lidx ] == 2; + n += smem[lidy ][lidx + 1] == 2; + n += smem[lidy ][lidx + 2] == 2; + + n += smem[lidy + 1][lidx ] == 2; + n += smem[lidy + 1][lidx + 2] == 2; + + n += smem[lidy + 2][lidx ] == 2; + n += smem[lidy + 2][lidx + 1] == 2; + n += smem[lidy + 2][lidx + 2] == 2; + } + + if (n > 0) + smem[lidy + 1][lidx + 1] = 2; + } + + const int e = smem[lidy + 1][lidx + 1]; + map[gidx + 1 + (gidy + 1) * map_step] = e; + + n = 0; + if(e == 2) + { + n += smem[lidy ][lidx ] == 1; + n += smem[lidy ][lidx + 1] == 1; + n += smem[lidy ][lidx + 2] == 1; + + n += smem[lidy + 1][lidx ] == 1; + n += smem[lidy + 1][lidx + 2] == 1; + + n += smem[lidy + 2][lidx ] == 1; + n += smem[lidy + 2][lidx + 1] == 1; + n += smem[lidy + 2][lidx + 2] == 1; + } + + if(n > 0) + { + unsigned int ind = 
atomic_inc(counter); + st[ind] = (ushort2)(gidx + 1, gidy + 1); + } + } +} + +__constant int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; +__constant int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; + + +#define stack_size 512 +__kernel +void +__attribute__((reqd_work_group_size(128,1,1))) +edgesHysteresisGlobal +( + __global int * map, + __global ushort2 * st1, + __global ushort2 * st2, + __global int * counter, + int rows, + int cols, + int count, + int map_step, + int map_offset +) +{ + + map_step /= sizeof(*map); + map_offset /= sizeof(*map); + + map += map_offset; + + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + int lidx = get_local_id(0); + int lidy = get_local_id(1); + + int grp_idx = get_group_id(0); + int grp_idy = get_group_id(1); + + __local unsigned int s_counter; + __local unsigned int s_ind; + + __local ushort2 s_st[stack_size]; + + if(lidx == 0) + { + s_counter = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + int ind = mad24(grp_idy, (int)get_local_size(0), grp_idx); + + if(ind < count) + { + ushort2 pos = st1[ind]; + if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) + { + if (lidx < 8) + { + pos.x += c_dx[lidx]; + pos.y += c_dy[lidx]; + + if (map[pos.x + pos.y * map_step] == 1) + { + map[pos.x + pos.y * map_step] = 2; + + ind = atomic_inc(&s_counter); + + s_st[ind] = pos; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + while (s_counter > 0 && s_counter <= stack_size - get_local_size(0)) + { + const int subTaskIdx = lidx >> 3; + const int portion = min(s_counter, (uint)(get_local_size(0)>> 3)); + + pos.x = pos.y = 0; + + if (subTaskIdx < portion) + pos = s_st[s_counter - 1 - subTaskIdx]; + barrier(CLK_LOCAL_MEM_FENCE); + + if (lidx == 0) + s_counter -= portion; + barrier(CLK_LOCAL_MEM_FENCE); + + if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) + { + pos.x += c_dx[lidx & 7]; + pos.y += c_dy[lidx & 7]; + + if (map[pos.x + pos.y * map_step] == 1) + { + map[pos.x + pos.y * map_step] = 2; + + ind = atomic_inc(&s_counter); + + 
s_st[ind] = pos; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (s_counter > 0) + { + if (lidx == 0) + { + ind = atomic_add(counter, s_counter); + s_ind = ind - s_counter; + } + barrier(CLK_LOCAL_MEM_FENCE); + + ind = s_ind; + + for (int i = lidx; i < s_counter; i += get_local_size(0)) + { + st2[ind + i] = s_st[i]; + } + } + } + } +} +#undef stack_size + +//Get the edge result. egde type of value 2 will be marked as an edge point and set to 255. Otherwise 0. +// map edge type mappings +// dst edge output +__kernel +void getEdges +( + __global const int * map, + __global uchar * dst, + int rows, + int cols, + int map_step, + int map_offset, + int dst_step, + int dst_offset +) +{ + map_step /= sizeof(*map); + map_offset /= sizeof(*map); + + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + if(gidy < rows && gidx < cols) + { + dst[gidx + gidy * dst_step] = (uchar)(-(map[gidx + 1 + (gidy + 1) * map_step + map_offset] >> 1)); + } +} diff --git a/modules/imgproc/src/opencl/clahe.cl b/modules/imgproc/src/opencl/clahe.cl new file mode 100644 index 0000000000..16c68fd474 --- /dev/null +++ b/modules/imgproc/src/opencl/clahe.cl @@ -0,0 +1,255 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. 
+// +// @Authors +// Sen Liu, swjtuls1987@126.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#ifndef WAVE_SIZE +#define WAVE_SIZE 1 +#endif + +int calc_lut(__local int* smem, int val, int tid) +{ + smem[tid] = val; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid == 0) + for (int i = 1; i < 256; ++i) + smem[i] += smem[i - 1]; + barrier(CLK_LOCAL_MEM_FENCE); + + return smem[tid]; +} + +#ifdef CPU +void reduce(volatile __local int* smem, int val, int tid) +{ + smem[tid] = val; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 128) + smem[tid] = val += smem[tid + 128]; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 64) + smem[tid] = val += smem[tid + 64]; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 32) + smem[tid] += smem[tid + 32]; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 16) + smem[tid] += smem[tid + 16]; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 8) + smem[tid] += smem[tid + 8]; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 4) + smem[tid] += smem[tid + 4]; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 2) + smem[tid] += smem[tid + 2]; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 1) + smem[256] = smem[tid] + smem[tid + 1]; + barrier(CLK_LOCAL_MEM_FENCE); +} + +#else + +void reduce(__local volatile int* smem, int val, int tid) +{ + smem[tid] = val; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 128) + smem[tid] = val += smem[tid + 128]; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 64) + smem[tid] = val += smem[tid + 64]; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 32) + { + smem[tid] += smem[tid + 32]; +#if WAVE_SIZE < 32 + } barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 16) + { +#endif + smem[tid] += smem[tid + 16]; +#if WAVE_SIZE < 16 + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 8) + { +#endif + smem[tid] += smem[tid + 8]; + smem[tid] += smem[tid + 4]; + smem[tid] += smem[tid + 2]; + smem[tid] += smem[tid + 1]; + } +} +#endif + +__kernel void calcLut(__global __const uchar * src, __global uchar * lut, + const int srcStep, const int dstStep, + const int2 tileSize, const int tilesX, + const int clipLimit, const float lutScale, + const int 
src_offset, const int dst_offset) +{ + __local int smem[512]; + + const int tx = get_group_id(0); + const int ty = get_group_id(1); + const unsigned int tid = get_local_id(1) * get_local_size(0) + + get_local_id(0); + + smem[tid] = 0; + barrier(CLK_LOCAL_MEM_FENCE); + + for (int i = get_local_id(1); i < tileSize.y; i += get_local_size(1)) + { + __global const uchar* srcPtr = src + mad24(ty * tileSize.y + i, srcStep, tx * tileSize.x + src_offset); + for (int j = get_local_id(0); j < tileSize.x; j += get_local_size(0)) + { + const int data = srcPtr[j]; + atomic_inc(&smem[data]); + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + int tHistVal = smem[tid]; + barrier(CLK_LOCAL_MEM_FENCE); + + if (clipLimit > 0) + { + // clip histogram bar + int clipped = 0; + if (tHistVal > clipLimit) + { + clipped = tHistVal - clipLimit; + tHistVal = clipLimit; + } + + // find number of overall clipped samples + reduce(smem, clipped, tid); + barrier(CLK_LOCAL_MEM_FENCE); +#ifdef CPU + clipped = smem[256]; +#else + clipped = smem[0]; +#endif + + // broadcast evaluated value + + __local int totalClipped; + + if (tid == 0) + totalClipped = clipped; + barrier(CLK_LOCAL_MEM_FENCE); + + // redistribute clipped samples evenly + + int redistBatch = totalClipped / 256; + tHistVal += redistBatch; + + int residual = totalClipped - redistBatch * 256; + if (tid < residual) + ++tHistVal; + } + + const int lutVal = calc_lut(smem, tHistVal, tid); + uint ires = (uint)convert_int_rte(lutScale * lutVal); + lut[(ty * tilesX + tx) * dstStep + tid + dst_offset] = + convert_uchar(clamp(ires, (uint)0, (uint)255)); +} + +__kernel void transform(__global __const uchar * src, + __global uchar * dst, + __global uchar * lut, + const int srcStep, const int dstStep, const int lutStep, + const int cols, const int rows, + const int2 tileSize, + const int tilesX, const int tilesY, + const int src_offset, const int dst_offset, int lut_offset) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + if (x >= 
cols || y >= rows) + return; + + const float tyf = (convert_float(y) / tileSize.y) - 0.5f; + int ty1 = convert_int_rtn(tyf); + int ty2 = ty1 + 1; + const float ya = tyf - ty1; + ty1 = max(ty1, 0); + ty2 = min(ty2, tilesY - 1); + + const float txf = (convert_float(x) / tileSize.x) - 0.5f; + int tx1 = convert_int_rtn(txf); + int tx2 = tx1 + 1; + const float xa = txf - tx1; + tx1 = max(tx1, 0); + tx2 = min(tx2, tilesX - 1); + + const int srcVal = src[mad24(y, srcStep, x + src_offset)]; + + float res = 0; + + res += lut[mad24(ty1 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (1.0f - ya)); + res += lut[mad24(ty1 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (1.0f - ya)); + res += lut[mad24(ty2 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (ya)); + res += lut[mad24(ty2 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (ya)); + + uint ires = (uint)convert_int_rte(res); + dst[mad24(y, dstStep, x + dst_offset)] = convert_uchar(clamp(ires, (uint)0, (uint)255)); +} diff --git a/modules/imgproc/src/opencl/convolve.cl b/modules/imgproc/src/opencl/convolve.cl new file mode 100644 index 0000000000..fb9596e5d6 --- /dev/null +++ b/modules/imgproc/src/opencl/convolve.cl @@ -0,0 +1,109 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. 
+// +// @Authors +// Jiang Liyuan, jlyuan001.good@163.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#if defined (__ATI__) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#elif defined (__NVIDIA__) +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#endif + +/************************************** convolve **************************************/ + +__kernel void convolve_D5(__global float *src, __global float *temp1, __global float *dst, + int rows, int cols, int src_step, int dst_step,int k_step, int kWidth, int kHeight, + int src_offset, int dst_offset, int koffset) +{ + __local float smem[16 + 2 * 8][16 + 2 * 8]; + + int x = get_local_id(0); + int y = get_local_id(1); + int gx = get_global_id(0); + int gy = get_global_id(1); + + // x | x 0 | 0 + // ----------- + // x | x 0 | 0 + // 0 | 0 0 | 0 + // ----------- + // 0 | 0 0 | 0 + smem[y][x] = src[min(max(gy - 8, 0), rows - 1) * src_step + min(max(gx - 8, 0), cols - 1) + src_offset]; + + // 0 | 0 x | x + // ----------- + // 0 | 0 x | x + // 0 | 0 0 | 0 + // ----------- + // 0 | 0 0 | 0 + smem[y][x + 16] = src[min(max(gy - 8, 0), rows - 1) * src_step + min(gx + 8, cols - 1) + src_offset]; + + // 0 | 0 0 | 0 + // ----------- + // 0 | 0 0 | 0 + // x | x 0 | 0 + // ----------- + // x | x 0 | 0 + smem[y + 16][x] = src[min(gy + 8, rows - 1) * src_step + min(max(gx - 8, 0), cols - 1) + src_offset]; + + // 0 | 0 0 | 0 + // ----------- + // 0 | 0 0 | 0 + // 0 | 0 x | x + // ----------- + // 0 | 0 x | x + smem[y + 16][x + 16] = src[min(gy + 8, rows - 1) * src_step + min(gx + 8, cols - 1) + src_offset]; + + barrier(CLK_LOCAL_MEM_FENCE); + + if (gx < cols && gy < rows) + { + float res = 0; + + for (int i = 0; i < kHeight; ++i) + for (int j = 0; j < kWidth; ++j) + res += smem[y + 8 - kHeight / 2 + i][x + 8 - kWidth / 2 + j] * temp1[i * k_step + j + koffset]; + + dst[gy * dst_step + gx + dst_offset] = res; + } +} diff --git a/modules/imgproc/src/opencl/copymakeborder.cl b/modules/imgproc/src/opencl/copymakeborder.cl new file mode 100644 index 0000000000..d97f660688 --- /dev/null +++ 
b/modules/imgproc/src/opencl/copymakeborder.cl @@ -0,0 +1,134 @@ +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Niko Li, newlife20080214@gmail.com +// Zero Lin zero.lin@amd.com +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +// + +#if defined (DOUBLE_SUPPORT) +#ifdef cl_amd_fp64 +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#elif defined (cl_khr_fp64) +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#endif +#endif + +#ifdef BORDER_CONSTANT +#define EXTRAPOLATE(x, y, v) v = scalar; +#elif defined BORDER_REPLICATE +#define EXTRAPOLATE(x, y, v) \ + { \ + x = max(min(x, src_cols - 1), 0); \ + y = max(min(y, src_rows - 1), 0); \ + v = src[mad24(y, src_step, x + src_offset)]; \ + } +#elif defined BORDER_WRAP +#define EXTRAPOLATE(x, y, v) \ + { \ + if (x < 0) \ + x -= ((x - src_cols + 1) / src_cols) * src_cols; \ + if (x >= src_cols) \ + x %= src_cols; \ + \ + if (y < 0) \ + y -= ((y - src_rows + 1) / src_rows) * src_rows; \ + if( y >= src_rows ) \ + y %= src_rows; \ + v = src[mad24(y, src_step, x + src_offset)]; \ + } +#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101) +#ifdef BORDER_REFLECT +#define DELTA int delta = 0 +#else +#define DELTA int delta = 1 +#endif +#define EXTRAPOLATE(x, y, v) \ + { \ + DELTA; \ + if (src_cols == 1) \ + x = 0; \ + else \ + do \ + { \ + if( x < 0 ) \ + x = -x - 1 + delta; \ + else \ + x = src_cols - 1 - (x - src_cols) - delta; \ + } \ + while (x >= src_cols || x < 0); \ + \ + if (src_rows == 1) \ + y = 0; \ + else \ + do \ + { \ + if( y < 0 ) \ + y = -y - 1 + delta; \ + else \ + y = src_rows - 1 - (y - src_rows) - delta; \ + } \ + while (y >= src_rows || y < 0); \ + v = src[mad24(y, src_step, x + src_offset)]; \ + } +#else +#error No extrapolation method +#endif + +#define NEED_EXTRAPOLATION(gx, gy) (gx >= src_cols || gy >= src_rows || gx < 0 || gy < 0) + +__kernel void copymakeborder + (__global const GENTYPE *src, + __global GENTYPE *dst, + int dst_cols, int dst_rows, + int src_cols, int src_rows, + int src_step, int src_offset, + int dst_step, int dst_offset, + int top, int left, GENTYPE scalar) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (x < dst_cols && y < dst_rows) + { + int src_x = x - left; + int src_y = y - top; + 
int dst_index = mad24(y, dst_step, x + dst_offset); + + if (NEED_EXTRAPOLATION(src_x, src_y)) + EXTRAPOLATE(src_x, src_y, dst[dst_index]) + else + { + int src_index = mad24(src_y, src_step, src_x + src_offset); + dst[dst_index] = src[src_index]; + } + } +} diff --git a/modules/imgproc/src/opencl/cvtcolor.cl b/modules/imgproc/src/opencl/cvtcolor.cl new file mode 100644 index 0000000000..9ca98b0b91 --- /dev/null +++ b/modules/imgproc/src/opencl/cvtcolor.cl @@ -0,0 +1,306 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Jia Haipeng, jiahaipeng95@gmail.com +// Peng Xiao, pengxiao@multicorewareinc.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. 
+// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +/**************************************PUBLICFUNC*************************************/ + +#if defined (DOUBLE_SUPPORT) +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#endif + +#if depth == 0 + #define DATA_TYPE uchar + #define MAX_NUM 255 + #define HALF_MAX 128 + #define SAT_CAST(num) convert_uchar_sat(num) + #define DEPTH_0 +#elif depth == 2 + #define DATA_TYPE ushort + #define MAX_NUM 65535 + #define HALF_MAX 32768 + #define SAT_CAST(num) convert_ushort_sat(num) + #define DEPTH_2 +#elif depth == 5 + #define DATA_TYPE float + #define MAX_NUM 1.0f + #define HALF_MAX 0.5f + #define SAT_CAST(num) (num) + #define DEPTH_5 +#else + #error "invalid depth: should be 0 (CV_8U), 2 (CV_16U) or 5 (CV_32F)" +#endif + +#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) + +enum +{ + yuv_shift = 14, + xyz_shift = 12, + R2Y = 4899, + G2Y = 9617, + B2Y = 1868, + BLOCK_SIZE = 256 +}; + +#define scnbytes ((int)sizeof(DATA_TYPE)*scn) +#define dcnbytes ((int)sizeof(DATA_TYPE)*dcn) + +///////////////////////////////////// RGB <-> GRAY ////////////////////////////////////// + +__kernel void RGB2Gray(__global const uchar* srcptr, int srcstep, int srcoffset, + __global uchar* dstptr, int 
dststep, int dstoffset, + int rows, int cols) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + if (y < rows && x < cols) + { + const DATA_TYPE* src = (const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes)); + DATA_TYPE* dst = (DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes)); +#if defined (DEPTH_5) + dst[0] = src[bidx] * 0.114f + src[1] * 0.587f + src[(bidx^2)] * 0.299f; +#else + dst[0] = (DATA_TYPE)CV_DESCALE((src[bidx] * B2Y + src[1] * G2Y + src[(bidx^2)] * R2Y), yuv_shift); +#endif + } +} + +__kernel void Gray2RGB(__global const uchar* srcptr, int srcstep, int srcoffset, + __global uchar* dstptr, int dststep, int dstoffset, + int rows, int cols) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + if (y < rows && x < cols) + { + const DATA_TYPE* src = (const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes)); + DATA_TYPE* dst = (DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes)); + DATA_TYPE val = src[0]; + dst[0] = dst[1] = dst[2] = val; +#if dcn == 4 + dst[3] = MAX_NUM; +#endif + } +} + +///////////////////////////////////// RGB <-> YUV ////////////////////////////////////// + +__constant float c_RGB2YUVCoeffs_f[5] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f }; +__constant int c_RGB2YUVCoeffs_i[5] = { B2Y, G2Y, R2Y, 8061, 14369 }; + +__kernel void RGB2YUV(__global const uchar* srcptr, int srcstep, int srcoffset, + __global uchar* dstptr, int dststep, int dstoffset, + int rows, int cols) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (y < rows && x < cols) + { + const DATA_TYPE* src = (const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes)); + DATA_TYPE* dst = (DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes)); + DATA_TYPE b=src[bidx], g=src[1], r=src[bidx^2]; + +#if defined (DEPTH_5) + __constant float * coeffs = c_RGB2YUVCoeffs_f; + const DATA_TYPE Y = b * coeffs[0] + g * coeffs[1] + r * coeffs[2]; + 
const DATA_TYPE U = (b - Y) * coeffs[3] + HALF_MAX; + const DATA_TYPE V = (r - Y) * coeffs[4] + HALF_MAX; +#else + __constant int * coeffs = c_RGB2YUVCoeffs_i; + const int delta = HALF_MAX * (1 << yuv_shift); + const int Y = CV_DESCALE(b * coeffs[0] + g * coeffs[1] + r * coeffs[2], yuv_shift); + const int U = CV_DESCALE((b - Y) * coeffs[3] + delta, yuv_shift); + const int V = CV_DESCALE((r - Y) * coeffs[4] + delta, yuv_shift); +#endif + + dst[0] = SAT_CAST( Y ); + dst[1] = SAT_CAST( U ); + dst[2] = SAT_CAST( V ); + } +} + +__constant float c_YUV2RGBCoeffs_f[5] = { 2.032f, -0.395f, -0.581f, 1.140f }; +__constant int c_YUV2RGBCoeffs_i[5] = { 33292, -6472, -9519, 18678 }; + +__kernel void YUV2RGB(__global const uchar* srcptr, int srcstep, int srcoffset, + __global uchar* dstptr, int dststep, int dstoffset, + int rows, int cols) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (y < rows && x < cols) + { + const DATA_TYPE* src = (const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes)); + DATA_TYPE* dst = (DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes)); + DATA_TYPE Y = src[0], U = src[1], V = src[2]; + +#if defined (DEPTH_5) + __constant float * coeffs = c_YUV2RGBCoeffs_f; + const float r = Y + (V - HALF_MAX) * coeffs[3]; + const float g = Y + (V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1]; + const float b = Y + (U - HALF_MAX) * coeffs[0]; +#else + __constant int * coeffs = c_YUV2RGBCoeffs_i; + const int r = Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift); + const int g = Y + CV_DESCALE((V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1], yuv_shift); + const int b = Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift); +#endif + + dst[bidx] = SAT_CAST( b ); + dst[1] = SAT_CAST( g ); + dst[bidx^2] = SAT_CAST( r ); +#if dcn == 4 + dst[3] = MAX_NUM; +#endif + } +} + +__constant int ITUR_BT_601_CY = 1220542; +__constant int ITUR_BT_601_CUB = 2116026; +__constant int ITUR_BT_601_CUG = 409993; +__constant 
int ITUR_BT_601_CVG = 852492; +__constant int ITUR_BT_601_CVR = 1673527; +__constant int ITUR_BT_601_SHIFT = 20; + +__kernel void YUV2RGBA_NV12(__global const uchar* srcptr, int srcstep, int srcoffset, + __global uchar* dstptr, int dststep, int dstoffset, + int rows, int cols) +{ + const int x = get_global_id(0); // max_x = width / 2 + const int y = get_global_id(1); // max_y = height/ 2 + + if (y < rows / 2 && x < cols / 2 ) + { + __global const uchar* ysrc = srcptr + mad24(y << 1, srcstep, (x << 1) + srcoffset); + __global const uchar* usrc = srcptr + mad24(rows + y, srcstep, (x << 1) + srcoffset); + __global uchar* dst1 = dstptr + mad24(y << 1, dststep, x*(dcn*2) + dstoffset); + __global uchar* dst2 = dstptr + mad24((y << 1) + 1, dststep, x*(dcn*2) + dstoffset); + + int Y1 = ysrc[0]; + int Y2 = ysrc[1]; + int Y3 = ysrc[srcstep]; + int Y4 = ysrc[srcstep + 1]; + + int U = usrc[0] - 128; + int V = usrc[1] - 128; + + int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * V; + int guv = (1 << (ITUR_BT_601_SHIFT - 1)) - ITUR_BT_601_CVG * V - ITUR_BT_601_CUG * U; + int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * U; + + Y1 = max(0, Y1 - 16) * ITUR_BT_601_CY; + dst1[2 - bidx] = convert_uchar_sat((Y1 + ruv) >> ITUR_BT_601_SHIFT); + dst1[1] = convert_uchar_sat((Y1 + guv) >> ITUR_BT_601_SHIFT); + dst1[bidx] = convert_uchar_sat((Y1 + buv) >> ITUR_BT_601_SHIFT); +#if dcn == 4 + dst1[3] = 255; +#endif + + Y2 = max(0, Y2 - 16) * ITUR_BT_601_CY; + dst1[(dcn + 2) - bidx] = convert_uchar_sat((Y2 + ruv) >> ITUR_BT_601_SHIFT); + dst1[dcn + 1] = convert_uchar_sat((Y2 + guv) >> ITUR_BT_601_SHIFT); + dst1[dcn + bidx] = convert_uchar_sat((Y2 + buv) >> ITUR_BT_601_SHIFT); +#if dcn == 4 + dst1[7] = 255; +#endif + + Y3 = max(0, Y3 - 16) * ITUR_BT_601_CY; + dst2[2 - bidx] = convert_uchar_sat((Y3 + ruv) >> ITUR_BT_601_SHIFT); + dst2[1] = convert_uchar_sat((Y3 + guv) >> ITUR_BT_601_SHIFT); + dst2[bidx] = convert_uchar_sat((Y3 + buv) >> ITUR_BT_601_SHIFT); +#if dcn == 4 + 
dst2[3] = 255; +#endif + + Y4 = max(0, Y4 - 16) * ITUR_BT_601_CY; + dst2[(dcn + 2) - bidx] = convert_uchar_sat((Y4 + ruv) >> ITUR_BT_601_SHIFT); + dst2[dcn + 1] = convert_uchar_sat((Y4 + guv) >> ITUR_BT_601_SHIFT); + dst2[dcn + bidx] = convert_uchar_sat((Y4 + buv) >> ITUR_BT_601_SHIFT); +#if dcn == 4 + dst2[7] = 255; +#endif + } +} + +///////////////////////////////////// RGB <-> YUV ////////////////////////////////////// + +__constant float c_RGB2YCrCbCoeffs_f[5] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f}; +__constant int c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, 11682, 9241}; + +__kernel void RGB2YCrCb(__global const uchar* srcptr, int srcstep, int srcoffset, + __global uchar* dstptr, int dststep, int dstoffset, + int rows, int cols) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (y < rows && x < cols) + { + const DATA_TYPE* src = (const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes)); + DATA_TYPE* dst = (DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes)); + DATA_TYPE b=src[bidx], g=src[1], r=src[bidx^2]; + +#if defined (DEPTH_5) + __constant float * coeffs = c_RGB2YCrCbCoeffs_f; + const DATA_TYPE Y = b * coeffs[0] + g * coeffs[1] + r * coeffs[2]; + const DATA_TYPE Cr = (r - Y) * coeffs[3] + HALF_MAX; + const DATA_TYPE Cb = (b - Y) * coeffs[4] + HALF_MAX; +#else + __constant int * coeffs = c_RGB2YCrCbCoeffs_i; + const int delta = HALF_MAX * (1 << yuv_shift); + const int Y = CV_DESCALE(b * coeffs[0] + g * coeffs[1] + r * coeffs[2], yuv_shift); + const int Cr = CV_DESCALE((r - Y) * coeffs[3] + delta, yuv_shift); + const int Cb = CV_DESCALE((b - Y) * coeffs[4] + delta, yuv_shift); +#endif + + dst[0] = SAT_CAST( Y ); + dst[1] = SAT_CAST( Cr ); + dst[2] = SAT_CAST( Cb ); + } +} diff --git a/modules/imgproc/src/opencl/gftt.cl b/modules/imgproc/src/opencl/gftt.cl new file mode 100644 index 0000000000..80bdec08ff --- /dev/null +++ b/modules/imgproc/src/opencl/gftt.cl @@ -0,0 +1,275 @@ 
+/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Peng Xiao, pengxiao@outlook.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef WITH_MASK +#define WITH_MASK 0 +#endif + +__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; + +inline float ELEM_INT2(image2d_t _eig, int _x, int _y) +{ + return read_imagef(_eig, sampler, (int2)(_x, _y)).x; +} + +inline float ELEM_FLT2(image2d_t _eig, float2 pt) +{ + return read_imagef(_eig, sampler, pt).x; +} + +__kernel + void findCorners + ( + image2d_t eig, + __global const char * mask, + __global float2 * corners, + const int mask_strip,// in pixels + const float threshold, + const int rows, + const int cols, + const int max_count, + __global int * g_counter + ) +{ + const int j = get_global_id(0); + const int i = get_global_id(1); + + if (i > 0 && i < rows - 1 && j > 0 && j < cols - 1 +#if WITH_MASK + && mask[i * mask_strip + j] != 0 +#endif + ) + { + const float val = ELEM_INT2(eig, j, i); + + if (val > threshold) + { + float maxVal = val; + + maxVal = fmax(ELEM_INT2(eig, j - 1, i - 1), maxVal); + maxVal = fmax(ELEM_INT2(eig, j , i - 1), maxVal); + maxVal = fmax(ELEM_INT2(eig, j + 1, i - 1), maxVal); + + maxVal = fmax(ELEM_INT2(eig, j - 1, i), maxVal); + maxVal = fmax(ELEM_INT2(eig, j + 1, i), maxVal); + + maxVal = fmax(ELEM_INT2(eig, j - 1, i + 1), maxVal); + maxVal = fmax(ELEM_INT2(eig, j , i + 1), maxVal); + maxVal = fmax(ELEM_INT2(eig, j + 1, i + 1), maxVal); + + if (val == maxVal) + { + const int ind = atomic_inc(g_counter); + + if (ind < 
max_count) + corners[ind] = (float2)(j, i); + } + } + } +} + +//bitonic sort +__kernel + void sortCorners_bitonicSort + ( + image2d_t eig, + __global float2 * corners, + const int count, + const int stage, + const int passOfStage + ) +{ + const int threadId = get_global_id(0); + if(threadId >= count / 2) + { + return; + } + + const int sortOrder = (((threadId/(1 << stage)) % 2)) == 1 ? 1 : 0; // 0 is descent + + const int pairDistance = 1 << (stage - passOfStage); + const int blockWidth = 2 * pairDistance; + + const int leftId = min( (threadId % pairDistance) + + (threadId / pairDistance) * blockWidth, count ); + + const int rightId = min( leftId + pairDistance, count ); + + const float2 leftPt = corners[leftId]; + const float2 rightPt = corners[rightId]; + + const float leftVal = ELEM_FLT2(eig, leftPt); + const float rightVal = ELEM_FLT2(eig, rightPt); + + const bool compareResult = leftVal > rightVal; + + float2 greater = compareResult ? leftPt:rightPt; + float2 lesser = compareResult ? rightPt:leftPt; + + corners[leftId] = sortOrder ? lesser : greater; + corners[rightId] = sortOrder ? greater : lesser; +} + +//selection sort for gfft +//kernel is ported from Bolt library: +//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl +// Local sort will firstly sort elements of each workgroup using selection sort +// its performance is O(n) +__kernel + void sortCorners_selectionSortLocal + ( + image2d_t eig, + __global float2 * corners, + const int count, + __local float2 * scratch + ) +{ + int i = get_local_id(0); // index in workgroup + int numOfGroups = get_num_groups(0); // index in workgroup + int groupID = get_group_id(0); + int wg = get_local_size(0); // workgroup size = block size + int n; // number of elements to be processed for this work group + + int offset = groupID * wg; + int same = 0; + corners += offset; + n = (groupID == (numOfGroups-1))? 
(count - wg*(numOfGroups-1)) : wg; + float2 pt1, pt2; + + pt1 = corners[min(i, n)]; + scratch[i] = pt1; + barrier(CLK_LOCAL_MEM_FENCE); + + if(i >= n) + { + return; + } + + float val1 = ELEM_FLT2(eig, pt1); + float val2; + + int pos = 0; + for (int j=0;j val1) + pos++;//calculate the rank of this element in this work group + else + { + if(val1 > val2) + continue; + else + { + // val1 and val2 are same + same++; + } + } + } + for (int j=0; j< same; j++) + corners[pos + j] = pt1; +} +__kernel + void sortCorners_selectionSortFinal + ( + image2d_t eig, + __global float2 * corners, + const int count + ) +{ + const int i = get_local_id(0); // index in workgroup + const int numOfGroups = get_num_groups(0); // index in workgroup + const int groupID = get_group_id(0); + const int wg = get_local_size(0); // workgroup size = block size + int pos = 0, same = 0; + const int offset = get_group_id(0) * wg; + const int remainder = count - wg*(numOfGroups-1); + + if((offset + i ) >= count) + return; + float2 pt1, pt2; + pt1 = corners[groupID*wg + i]; + + float val1 = ELEM_FLT2(eig, pt1); + float val2; + + for(int j=0; j val2) + break; + else + { + //Increment only if the value is not the same. + if( val2 > val1 ) + pos++; + else + same++; + } + } + } + + for(int k=0; k val2) + break; + else + { + //Don't increment if the value is the same. + //Two elements are same if (*userComp)(jData, iData) and (*userComp)(iData, jData) are both false + if(val2 > val1) + pos++; + else + same++; + } + } + for (int j=0; j< same; j++) + corners[pos + j] = pt1; +} diff --git a/modules/imgproc/src/opencl/harris.cl b/modules/imgproc/src/opencl/harris.cl new file mode 100644 index 0000000000..cac0b2cd30 --- /dev/null +++ b/modules/imgproc/src/opencl/harris.cl @@ -0,0 +1,202 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
+// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Shengen Yan,yanshengen@gmail.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if defined (DOUBLE_SUPPORT) +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#endif +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////Macro for border type//////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////////////// +#ifdef BORDER_REPLICATE +//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i)) +#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr)) +#endif + +#ifdef BORDER_REFLECT +//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i)) +#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr)) +#endif + +#ifdef BORDER_REFLECT101 +//BORDER_REFLECT101: gfedcb|abcdefgh|gfedcba +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? 
-(i) : (i)) +#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr)) +#endif + +#ifdef BORDER_WRAP +//BORDER_WRAP: cdefgh|abcdefgh|abcdefg +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i)) +#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr)) +#endif + +#define THREADS 256 +#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2) +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////calcHarris//////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// +__kernel void calcHarris(__global const float *Dx,__global const float *Dy, __global float *dst, + int dx_offset, int dx_whole_rows, int dx_whole_cols, int dx_step, + int dy_offset, int dy_whole_rows, int dy_whole_cols, int dy_step, + int dst_offset, int dst_rows, int dst_cols, int dst_step, + float k) +{ + int col = get_local_id(0); + const int gX = get_group_id(0); + const int gY = get_group_id(1); + const int glx = get_global_id(0); + const int gly = get_global_id(1); + + int dx_x_off = (dx_offset % dx_step) >> 2; + int dx_y_off = dx_offset / dx_step; + int dy_x_off = (dy_offset % dy_step) >> 2; + int dy_y_off = dy_offset / dy_step; + int dst_x_off = (dst_offset % dst_step) >> 2; + int dst_y_off = dst_offset / dst_step; + + int dx_startX = gX * (THREADS-ksX+1) - anX + dx_x_off; + int dx_startY = (gY << 1) - anY + dx_y_off; + int dy_startX = gX * (THREADS-ksX+1) - anX + dy_x_off; + int dy_startY = (gY << 1) - anY + dy_y_off; + int dst_startX = gX * (THREADS-ksX+1) + dst_x_off; + int dst_startY = (gY << 1) + dst_y_off; + + float dx_data[ksY+1],dy_data[ksY+1],data[3][ksY+1]; + __local 
float temp[6][THREADS]; +#ifdef BORDER_CONSTANT + bool dx_con,dy_con; + float dx_s,dy_s; + for(int i=0; i < ksY+1; i++) + { + dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows; + dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)]; + dx_data[i] = dx_con ? dx_s : 0.0; + dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows; + dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)]; + dy_data[i] = dy_con ? dy_s : 0.0; + data[0][i] = dx_data[i] * dx_data[i]; + data[1][i] = dx_data[i] * dy_data[i]; + data[2][i] = dy_data[i] * dy_data[i]; + } +#else + int clamped_col = min(dst_cols, col); + for(int i=0; i < ksY+1; i++) + { + int dx_selected_row; + int dx_selected_col; + dx_selected_row = ADDR_H(dx_startY+i, 0, dx_whole_rows); + dx_selected_row = ADDR_B(dx_startY+i, dx_whole_rows, dx_selected_row); + dx_selected_col = ADDR_L(dx_startX+clamped_col, 0, dx_whole_cols); + dx_selected_col = ADDR_R(dx_startX+clamped_col, dx_whole_cols, dx_selected_col); + dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col]; + + int dy_selected_row; + int dy_selected_col; + dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows); + dy_selected_row = ADDR_B(dy_startY+i, dy_whole_rows, dy_selected_row); + dy_selected_col = ADDR_L(dy_startX+clamped_col, 0, dy_whole_cols); + dy_selected_col = ADDR_R(dy_startX+clamped_col, dy_whole_cols, dy_selected_col); + dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col]; + + data[0][i] = dx_data[i] * dx_data[i]; + data[1][i] = dx_data[i] * dy_data[i]; + data[2][i] = dy_data[i] * dy_data[i]; + } +#endif + float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0; + for(int i=1; i < ksY; i++) + { + sum0 += (data[0][i]); + sum1 += (data[1][i]); + sum2 += (data[2][i]); + } + float sum01,sum02,sum11,sum12,sum21,sum22; + sum01 = sum0 + (data[0][0]); + sum02 = sum0 + (data[0][ksY]); + temp[0][col] = sum01; + temp[1][col] = sum02; + sum11 = 
sum1 + (data[1][0]); + sum12 = sum1 + (data[1][ksY]); + temp[2][col] = sum11; + temp[3][col] = sum12; + sum21 = sum2 + (data[2][0]); + sum22 = sum2 + (data[2][ksY]); + temp[4][col] = sum21; + temp[5][col] = sum22; + barrier(CLK_LOCAL_MEM_FENCE); + if(col < (THREADS-(ksX-1))) + { + col += anX; + int posX = dst_startX - dst_x_off + col - anX; + int posY = (gly << 1); + int till = (ksX + 1)%2; + float tmp_sum[6]={ 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0 }; + for(int k=0; k<6; k++) + for(int i=-anX; i<=anX - till; i++) + { + tmp_sum[k] += temp[k][col+i]; + } + + if(posX < dst_cols && (posY) < dst_rows) + { + dst[(dst_startY+0) * (dst_step>>2)+ dst_startX + col - anX] = + tmp_sum[0] * tmp_sum[4] - tmp_sum[2] * tmp_sum[2] - k * (tmp_sum[0] + tmp_sum[4]) * (tmp_sum[0] + tmp_sum[4]); + } + if(posX < dst_cols && (posY + 1) < dst_rows) + { + dst[(dst_startY+1) * (dst_step>>2)+ dst_startX + col - anX] = + tmp_sum[1] * tmp_sum[5] - tmp_sum[3] * tmp_sum[3] - k * (tmp_sum[1] + tmp_sum[5]) * (tmp_sum[1] + tmp_sum[5]); + } + } +} diff --git a/modules/imgproc/src/opencl/histogram.cl b/modules/imgproc/src/opencl/histogram.cl new file mode 100644 index 0000000000..bac9a6b899 --- /dev/null +++ b/modules/imgproc/src/opencl/histogram.cl @@ -0,0 +1,279 @@ +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. 
+// +// @Authors +// Niko Li, newlife20080214@gmail.com +// Jia Haipeng, jiahaipeng95@gmail.com +// Xu Pang, pangxu010@163.com +// Wenju He, wenju@multicorewareinc.com +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +// +#define PARTIAL_HISTOGRAM256_COUNT (256) +#define HISTOGRAM256_BIN_COUNT (256) + +#define HISTOGRAM256_WORK_GROUP_SIZE (256) +#define HISTOGRAM256_LOCAL_MEM_SIZE (HISTOGRAM256_BIN_COUNT) + +#define NBANKS (16) +#define NBANKS_BIT (4) + + +__kernel __attribute__((reqd_work_group_size(HISTOGRAM256_BIN_COUNT,1,1)))void calc_sub_hist_D0( + __global const uint4* src, + int src_step, int src_offset, + __global int* globalHist, + int dataCount, int cols, + int inc_x, int inc_y, + int hist_step) +{ + __local int subhist[(HISTOGRAM256_BIN_COUNT << NBANKS_BIT)]; // NBINS*NBANKS + int gid = get_global_id(0); + int lid = get_local_id(0); + int gx = get_group_id(0); + int gsize = get_global_size(0); + int lsize = get_local_size(0); + const int shift = 8; + const int mask = HISTOGRAM256_BIN_COUNT-1; + int offset = (lid & (NBANKS-1));// lid % NBANKS + uint4 data, temp1, temp2, temp3, temp4; + src += src_offset; + + //clear LDS + for(int i=0, idx=lid; i<(NBANKS >> 2); i++, idx += lsize) + { + subhist[idx] = 0; + subhist[idx+=lsize] = 0; + subhist[idx+=lsize] = 0; + subhist[idx+=lsize] = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + //read and scatter + int y = gid/cols; + int x = gid - mul24(y, cols); + for(int idx=gid; idx>= shift; + temp2 = ((data & mask) << NBANKS_BIT) + offset; + data >>= shift; + temp3 = ((data & mask) << NBANKS_BIT) + offset; + data >>= shift; + temp4 = ((data & mask) << NBANKS_BIT) + offset; + + atomic_inc(subhist + temp1.x); + atomic_inc(subhist + temp1.y); + atomic_inc(subhist + temp1.z); + atomic_inc(subhist + temp1.w); + + atomic_inc(subhist + temp2.x); + atomic_inc(subhist + temp2.y); + atomic_inc(subhist + temp2.z); + atomic_inc(subhist + temp2.w); + + atomic_inc(subhist + temp3.x); + atomic_inc(subhist + temp3.y); + atomic_inc(subhist + temp3.z); + atomic_inc(subhist + temp3.w); + + atomic_inc(subhist + temp4.x); + atomic_inc(subhist + temp4.y); + atomic_inc(subhist + temp4.z); + atomic_inc(subhist + temp4.w); + + x += inc_x; + int off = 
((x>=cols) ? -1 : 0); + x = mad24(off, cols, x); + y += inc_y - off; + } + barrier(CLK_LOCAL_MEM_FENCE); + + //reduce local banks to single histogram per workgroup + int bin1=0, bin2=0, bin3=0, bin4=0; + for(int i=0; i=left_col) ? (gidx+cols) : gidx); + if(gidy= rows ? HISTOGRAM256_LOCAL_MEM_SIZE : p; + atomic_inc(subhist + p); + } + barrier(CLK_LOCAL_MEM_FENCE); + + globalHist[mad24(rowIndex, hist_step, lidy)] += subhist[lidy]; +} + +__kernel __attribute__((reqd_work_group_size(256,1,1)))void merge_hist(__global int* buf, + __global int* hist, + int src_step) +{ + int lx = get_local_id(0); + int gx = get_group_id(0); + + int sum = 0; + + for(int i = lx; i < PARTIAL_HISTOGRAM256_COUNT; i += HISTOGRAM256_WORK_GROUP_SIZE) + sum += buf[ mad24(i, src_step, gx)]; + + __local int data[HISTOGRAM256_WORK_GROUP_SIZE]; + data[lx] = sum; + + for(int stride = HISTOGRAM256_WORK_GROUP_SIZE /2; stride > 0; stride >>= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + if(lx < stride) + data[lx] += data[lx + stride]; + } + + if(lx == 0) + hist[gx] = data[0]; +} + +__kernel __attribute__((reqd_work_group_size(256,1,1))) +void calLUT(__global uchar * dst, __constant int * hist, int total) +{ + int lid = get_local_id(0); + __local int sumhist[HISTOGRAM256_BIN_COUNT]; + __local float scale; + + sumhist[lid] = hist[lid]; + barrier(CLK_LOCAL_MEM_FENCE); + if (lid == 0) + { + int sum = 0, i = 0; + while (!sumhist[i]) + ++i; + + if (total == sumhist[i]) + { + scale = 1; + for (int j = 0; j < HISTOGRAM256_BIN_COUNT; ++j) + sumhist[i] = i; + } + else + { + scale = 255.f/(total - sumhist[i]); + + for (sumhist[i++] = 0; i < HISTOGRAM256_BIN_COUNT; i++) + { + sum += sumhist[i]; + sumhist[i] = sum; + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + dst[lid]= convert_uchar_sat_rte(convert_float(sumhist[lid])*scale); +} + +/* +///////////////////////////////equalizeHist////////////////////////////////////////////////// +__kernel __attribute__((reqd_work_group_size(256,1,1)))void equalizeHist( + __global uchar 
* src, + __global uchar * dst, + __constant int * hist, + int srcstep, + int srcoffset, + int dststep, + int dstoffset, + int width, + int height, + float scale, + int inc_x, + int inc_y) +{ + int gidx = get_global_id(0); + int lid = get_local_id(0); + int glb_size = get_global_size(0); + src+=srcoffset; + dst+=dstoffset; + __local int sumhist[HISTOGRAM256_BIN_COUNT]; + __local uchar lut[HISTOGRAM256_BIN_COUNT+1]; + + sumhist[lid]=hist[lid]; + barrier(CLK_LOCAL_MEM_FENCE); + if(lid==0) + { + int sum = 0; + for(int i=0;i= width ? -1 : 0); + pos_x = mad24(off,width,pos_x); + pos_y += inc_y - off; + } +} +*/ diff --git a/modules/imgproc/src/opencl/hough.cl b/modules/imgproc/src/opencl/hough.cl new file mode 100644 index 0000000000..fd1c5b9a8a --- /dev/null +++ b/modules/imgproc/src/opencl/hough.cl @@ -0,0 +1,280 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or bpied warranties, including, but not limited to, the bpied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable + +//////////////////////////////////////////////////////////////////////// +// buildPointList + +#define PIXELS_PER_THREAD 16 + +// TODO: add offset to support ROI +__kernel void buildPointList(__global const uchar* src, + int cols, + int rows, + int step, + __global unsigned int* list, + __global int* counter) +{ + __local unsigned int s_queues[4][32 * PIXELS_PER_THREAD]; + __local int s_qsize[4]; + __local int s_globStart[4]; + + const int x = get_group_id(0) * get_local_size(0) * PIXELS_PER_THREAD + get_local_id(0); + const int y = get_global_id(1); + + if (get_local_id(0) == 0) + s_qsize[get_local_id(1)] = 0; + barrier(CLK_LOCAL_MEM_FENCE); + + if (y < rows) + { + // fill the queue + __global const uchar* srcRow = &src[y * step]; + for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < cols; ++i, xx += get_local_size(0)) + { + if (srcRow[xx]) + { + const unsigned int val = 
(y << 16) | xx; + const int qidx = atomic_add(&s_qsize[get_local_id(1)], 1); + s_queues[get_local_id(1)][qidx] = val; + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // let one work-item reserve the space required in the global list + if (get_local_id(0) == 0 && get_local_id(1) == 0) + { + // find how many items are stored in each list + int totalSize = 0; + for (int i = 0; i < get_local_size(1); ++i) + { + s_globStart[i] = totalSize; + totalSize += s_qsize[i]; + } + + // calculate the offset in the global list + const int globalOffset = atomic_add(counter, totalSize); + for (int i = 0; i < get_local_size(1); ++i) + s_globStart[i] += globalOffset; + } + + barrier(CLK_GLOBAL_MEM_FENCE); + + // copy local queues to global queue + const int qsize = s_qsize[get_local_id(1)]; + int gidx = s_globStart[get_local_id(1)] + get_local_id(0); + for(int i = get_local_id(0); i < qsize; i += get_local_size(0), gidx += get_local_size(0)) + list[gidx] = s_queues[get_local_id(1)][i]; +} + +//////////////////////////////////////////////////////////////////////// +// circlesAccumCenters + +// TODO: add offset to support ROI +__kernel void circlesAccumCenters(__global const unsigned int* list, + const int count, + __global const int* dx, + const int dxStep, + __global const int* dy, + const int dyStep, + __global int* accum, + const int accumStep, + const int width, + const int height, + const int minRadius, + const int maxRadius, + const float idp) +{ + const int dxStepInPixel = dxStep / sizeof(int); + const int dyStepInPixel = dyStep / sizeof(int); + const int accumStepInPixel = accumStep / sizeof(int); + + const int SHIFT = 10; + const int ONE = 1 << SHIFT; + + // const int tid = blockIdx.x * blockDim.x + threadIdx.x; + const int wid = get_global_id(0); + + if (wid >= count) + return; + + const unsigned int val = list[wid]; + + const int x = (val & 0xFFFF); + const int y = (val >> 16) & 0xFFFF; + + const int vx = dx[mad24(y, dxStepInPixel, x)]; + const int vy = dy[mad24(y, 
dyStepInPixel, x)]; + + if (vx == 0 && vy == 0) + return; + + const float mag = sqrt(convert_float(vx * vx + vy * vy)); + + const int x0 = convert_int_rte((x * idp) * ONE); + const int y0 = convert_int_rte((y * idp) * ONE); + + int sx = convert_int_rte((vx * idp) * ONE / mag); + int sy = convert_int_rte((vy * idp) * ONE / mag); + + // Step from minRadius to maxRadius in both directions of the gradient + for (int k1 = 0; k1 < 2; ++k1) + { + int x1 = x0 + minRadius * sx; + int y1 = y0 + minRadius * sy; + + for (int r = minRadius; r <= maxRadius; x1 += sx, y1 += sy, ++r) + { + const int x2 = x1 >> SHIFT; + const int y2 = y1 >> SHIFT; + + if (x2 < 0 || x2 >= width || y2 < 0 || y2 >= height) + break; + + atomic_add(&accum[mad24(y2+1, accumStepInPixel, x2+1)], 1); + } + + sx = -sx; + sy = -sy; + } +} + +// //////////////////////////////////////////////////////////////////////// +// // buildCentersList + +// TODO: add offset to support ROI +__kernel void buildCentersList(__global const int* accum, + const int accumCols, + const int accumRows, + const int accumStep, + __global unsigned int* centers, + const int threshold, + __global int* counter) +{ + const int accumStepInPixel = accumStep/sizeof(int); + + const int x = get_global_id(0); + const int y = get_global_id(1); + + if (x < accumCols - 2 && y < accumRows - 2) + { + const int top = accum[mad24(y, accumStepInPixel, x + 1)]; + + const int left = accum[mad24(y + 1, accumStepInPixel, x)]; + const int cur = accum[mad24(y + 1, accumStepInPixel, x + 1)]; + const int right = accum[mad24(y + 1, accumStepInPixel, x + 2)]; + + const int bottom = accum[mad24(y + 2, accumStepInPixel, x + 1)];; + + if (cur > threshold && cur > top && cur >= bottom && cur > left && cur >= right) + { + const unsigned int val = (y << 16) | x; + const int idx = atomic_add(counter, 1); + centers[idx] = val; + } + } +} + + +// //////////////////////////////////////////////////////////////////////// +// // circlesAccumRadius + +// TODO: add offset to 
support ROI +__kernel void circlesAccumRadius(__global const unsigned int* centers, + __global const unsigned int* list, const int count, + __global float4* circles, const int maxCircles, + const float dp, + const int minRadius, const int maxRadius, + const int histSize, + const int threshold, + __local int* smem, + __global int* counter) +{ + for (int i = get_local_id(0); i < histSize + 2; i += get_local_size(0)) + smem[i] = 0; + barrier(CLK_LOCAL_MEM_FENCE); + + unsigned int val = centers[get_group_id(0)]; + + float cx = convert_float(val & 0xFFFF); + float cy = convert_float((val >> 16) & 0xFFFF); + + cx = (cx + 0.5f) * dp; + cy = (cy + 0.5f) * dp; + + for (int i = get_local_id(0); i < count; i += get_local_size(0)) + { + val = list[i]; + + const int x = (val & 0xFFFF); + const int y = (val >> 16) & 0xFFFF; + + const float rad = sqrt((cx - x) * (cx - x) + (cy - y) * (cy - y)); + if (rad >= minRadius && rad <= maxRadius) + { + const int r = convert_int_rte(rad - minRadius); + + atomic_add(&smem[r + 1], 1); + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + for (int i = get_local_id(0); i < histSize; i += get_local_size(0)) + { + const int curVotes = smem[i + 1]; + + if (curVotes >= threshold && curVotes > smem[i] && curVotes >= smem[i + 2]) + + { + const int ind = atomic_add(counter, 1); + if (ind < maxCircles) + { + circles[ind] = (float4)(cx, cy, convert_float(i + minRadius), 0.0f); + } + } + } +} diff --git a/modules/imgproc/src/opencl/integral.cl b/modules/imgproc/src/opencl/integral.cl new file mode 100644 index 0000000000..f10b184e55 --- /dev/null +++ b/modules/imgproc/src/opencl/integral.cl @@ -0,0 +1,493 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. 
+// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Shengen Yan,yanshengen@gmail.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif +#endif +#define LSIZE 256 +#define LSIZE_1 255 +#define LSIZE_2 254 +#define HF_LSIZE 128 +#define LOG_LSIZE 8 +#define LOG_NUM_BANKS 5 +#define NUM_BANKS 32 +#define GET_CONFLICT_OFFSET(lid) ((lid) >> LOG_NUM_BANKS) + + +kernel void integral_cols_D4(__global uchar4 *src,__global int *sum ,__global float *sqsum, + int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step) +{ + unsigned int lid = get_local_id(0); + unsigned int gid = get_group_id(0); + int4 src_t[2], sum_t[2]; + float4 sqsum_t[2]; + __local int4 lm_sum[2][LSIZE + LOG_LSIZE]; + __local float4 lm_sqsum[2][LSIZE + LOG_LSIZE]; + __local int* sum_p; + __local float* sqsum_p; + src_step = src_step >> 2; + gid = gid << 1; + for(int i = 0; i < rows; i =i + LSIZE_1) + { + src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : 0); + src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : 0); + + sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); + sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); + sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); + sqsum_t[1] = (i == 0 ? 
(float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); + barrier(CLK_LOCAL_MEM_FENCE); + + int bf_loc = lid + GET_CONFLICT_OFFSET(lid); + lm_sum[0][bf_loc] = src_t[0]; + lm_sqsum[0][bf_loc] = convert_float4(src_t[0] * src_t[0]); + + lm_sum[1][bf_loc] = src_t[1]; + lm_sqsum[1][bf_loc] = convert_float4(src_t[1] * src_t[1]); + + int offset = 1; + for(int d = LSIZE >> 1 ; d > 0; d>>=1) + { + barrier(CLK_LOCAL_MEM_FENCE); + int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; + ai += GET_CONFLICT_OFFSET(ai); + bi += GET_CONFLICT_OFFSET(bi); + + if((lid & 127) < d) + { + lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; + lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai]; + } + offset <<= 1; + } + barrier(CLK_LOCAL_MEM_FENCE); + if(lid < 2) + { + lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0; + lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0; + } + for(int d = 1; d < LSIZE; d <<= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + offset >>= 1; + int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; + ai += GET_CONFLICT_OFFSET(ai); + bi += GET_CONFLICT_OFFSET(bi); + + if((lid & 127) < d) + { + lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; + lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai]; + + lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai]; + lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ; + if(lid > 0 && (i+lid) <= rows) + { + lm_sum[0][bf_loc] += sum_t[0]; + lm_sum[1][bf_loc] += sum_t[1]; + lm_sqsum[0][bf_loc] += sqsum_t[0]; + lm_sqsum[1][bf_loc] += sqsum_t[1]; + sum_p = (__local int*)(&(lm_sum[0][bf_loc])); + sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc])); + for(int k = 0; k < 4; k++) + { + if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue; + sum[loc_s0 + k * dst_step / 4] = sum_p[k]; + sqsum[loc_s0 + k * dst_step / 4] = sqsum_p[k]; + } + sum_p = (__local 
int*)(&(lm_sum[1][bf_loc])); + sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc])); + for(int k = 0; k < 4; k++) + { + if(gid * 4 + k + 4 >= cols + pre_invalid) break; + sum[loc_s1 + k * dst_step / 4] = sum_p[k]; + sqsum[loc_s1 + k * dst_step / 4] = sqsum_p[k]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } +} + + +kernel void integral_rows_D4(__global int4 *srcsum,__global float4 * srcsqsum,__global int *sum , + __global float *sqsum,int rows,int cols,int src_step,int sum_step, + int sqsum_step,int sum_offset,int sqsum_offset) +{ + unsigned int lid = get_local_id(0); + unsigned int gid = get_group_id(0); + int4 src_t[2], sum_t[2]; + float4 sqsrc_t[2],sqsum_t[2]; + __local int4 lm_sum[2][LSIZE + LOG_LSIZE]; + __local float4 lm_sqsum[2][LSIZE + LOG_LSIZE]; + __local int *sum_p; + __local float *sqsum_p; + src_step = src_step >> 4; + for(int i = 0; i < rows; i =i + LSIZE_1) + { + src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (int4)0; + sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : (float4)0; + src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (int4)0; + sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0; + + sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); + sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); + sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); + sqsum_t[1] = (i == 0 ? 
(float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); + barrier(CLK_LOCAL_MEM_FENCE); + + int bf_loc = lid + GET_CONFLICT_OFFSET(lid); + lm_sum[0][bf_loc] = src_t[0]; + lm_sqsum[0][bf_loc] = sqsrc_t[0]; + + lm_sum[1][bf_loc] = src_t[1]; + lm_sqsum[1][bf_loc] = sqsrc_t[1]; + + int offset = 1; + for(int d = LSIZE >> 1 ; d > 0; d>>=1) + { + barrier(CLK_LOCAL_MEM_FENCE); + int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; + ai += GET_CONFLICT_OFFSET(ai); + bi += GET_CONFLICT_OFFSET(bi); + + if((lid & 127) < d) + { + lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; + lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai]; + } + offset <<= 1; + } + barrier(CLK_LOCAL_MEM_FENCE); + if(lid < 2) + { + lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0; + lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0; + } + for(int d = 1; d < LSIZE; d <<= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + offset >>= 1; + int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; + ai += GET_CONFLICT_OFFSET(ai); + bi += GET_CONFLICT_OFFSET(bi); + + if((lid & 127) < d) + { + lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; + lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai]; + + lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai]; + lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + if(gid == 0 && (i + lid) <= rows) + { + sum[sum_offset + i + lid] = 0; + sqsum[sqsum_offset + i + lid] = 0; + } + if(i + lid == 0) + { + int loc0 = gid * 2 * sum_step; + int loc1 = gid * 2 * sqsum_step; + for(int k = 1; k <= 8; k++) + { + if(gid * 8 + k > cols) break; + sum[sum_offset + loc0 + k * sum_step / 4] = 0; + sqsum[sqsum_offset + loc1 + k * sqsum_step / 4] = 0; + } + } + int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ; + int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ; + if(lid > 0 && (i+lid) <= rows) + { + lm_sum[0][bf_loc] += sum_t[0]; + 
lm_sum[1][bf_loc] += sum_t[1]; + lm_sqsum[0][bf_loc] += sqsum_t[0]; + lm_sqsum[1][bf_loc] += sqsum_t[1]; + sum_p = (__local int*)(&(lm_sum[0][bf_loc])); + sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc])); + for(int k = 0; k < 4; k++) + { + if(gid * 8 + k >= cols) break; + sum[loc_s0 + k * sum_step / 4] = sum_p[k]; + sqsum[loc_sq0 + k * sqsum_step / 4] = sqsum_p[k]; + } + sum_p = (__local int*)(&(lm_sum[1][bf_loc])); + sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc])); + for(int k = 0; k < 4; k++) + { + if(gid * 8 + 4 + k >= cols) break; + sum[loc_s1 + k * sum_step / 4] = sum_p[k]; + sqsum[loc_sq1 + k * sqsum_step / 4] = sqsum_p[k]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } +} + +kernel void integral_cols_D5(__global uchar4 *src,__global float *sum ,__global float *sqsum, + int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step) +{ + unsigned int lid = get_local_id(0); + unsigned int gid = get_group_id(0); + float4 src_t[2], sum_t[2]; + float4 sqsum_t[2]; + __local float4 lm_sum[2][LSIZE + LOG_LSIZE]; + __local float4 lm_sqsum[2][LSIZE + LOG_LSIZE]; + __local float* sum_p; + __local float* sqsum_p; + src_step = src_step >> 2; + gid = gid << 1; + for(int i = 0; i < rows; i =i + LSIZE_1) + { + src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : (float4)0); + src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : (float4)0); + + sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); + sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); + sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); + sqsum_t[1] = (i == 0 ? 
(float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); + barrier(CLK_LOCAL_MEM_FENCE); + + int bf_loc = lid + GET_CONFLICT_OFFSET(lid); + lm_sum[0][bf_loc] = src_t[0]; + lm_sqsum[0][bf_loc] = convert_float4(src_t[0] * src_t[0]); + + lm_sum[1][bf_loc] = src_t[1]; + lm_sqsum[1][bf_loc] = convert_float4(src_t[1] * src_t[1]); + + int offset = 1; + for(int d = LSIZE >> 1 ; d > 0; d>>=1) + { + barrier(CLK_LOCAL_MEM_FENCE); + int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; + ai += GET_CONFLICT_OFFSET(ai); + bi += GET_CONFLICT_OFFSET(bi); + + if((lid & 127) < d) + { + lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; + lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai]; + } + offset <<= 1; + } + barrier(CLK_LOCAL_MEM_FENCE); + if(lid < 2) + { + lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0; + lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0; + } + for(int d = 1; d < LSIZE; d <<= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + offset >>= 1; + int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; + ai += GET_CONFLICT_OFFSET(ai); + bi += GET_CONFLICT_OFFSET(bi); + + if((lid & 127) < d) + { + lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; + lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai]; + + lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai]; + lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ; + if(lid > 0 && (i+lid) <= rows) + { + lm_sum[0][bf_loc] += sum_t[0]; + lm_sum[1][bf_loc] += sum_t[1]; + lm_sqsum[0][bf_loc] += sqsum_t[0]; + lm_sqsum[1][bf_loc] += sqsum_t[1]; + sum_p = (__local float*)(&(lm_sum[0][bf_loc])); + sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc])); + for(int k = 0; k < 4; k++) + { + if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue; + sum[loc_s0 + k * dst_step / 4] = sum_p[k]; + sqsum[loc_s0 + k * dst_step / 4] = sqsum_p[k]; + } + sum_p = (__local 
float*)(&(lm_sum[1][bf_loc])); + sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc])); + for(int k = 0; k < 4; k++) + { + if(gid * 4 + k + 4 >= cols + pre_invalid) break; + sum[loc_s1 + k * dst_step / 4] = sum_p[k]; + sqsum[loc_s1 + k * dst_step / 4] = sqsum_p[k]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } +} + + +kernel void integral_rows_D5(__global float4 *srcsum,__global float4 * srcsqsum,__global float *sum , + __global float *sqsum,int rows,int cols,int src_step,int sum_step, + int sqsum_step,int sum_offset,int sqsum_offset) +{ + unsigned int lid = get_local_id(0); + unsigned int gid = get_group_id(0); + float4 src_t[2], sum_t[2]; + float4 sqsrc_t[2],sqsum_t[2]; + __local float4 lm_sum[2][LSIZE + LOG_LSIZE]; + __local float4 lm_sqsum[2][LSIZE + LOG_LSIZE]; + __local float *sum_p; + __local float *sqsum_p; + src_step = src_step >> 4; + for(int i = 0; i < rows; i =i + LSIZE_1) + { + src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (float4)0; + sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : (float4)0; + src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0; + sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0; + + sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); + sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); + sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); + sqsum_t[1] = (i == 0 ? 
(float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); + barrier(CLK_LOCAL_MEM_FENCE); + + int bf_loc = lid + GET_CONFLICT_OFFSET(lid); + lm_sum[0][bf_loc] = src_t[0]; + lm_sqsum[0][bf_loc] = sqsrc_t[0]; + + lm_sum[1][bf_loc] = src_t[1]; + lm_sqsum[1][bf_loc] = sqsrc_t[1]; + + int offset = 1; + for(int d = LSIZE >> 1 ; d > 0; d>>=1) + { + barrier(CLK_LOCAL_MEM_FENCE); + int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; + ai += GET_CONFLICT_OFFSET(ai); + bi += GET_CONFLICT_OFFSET(bi); + + if((lid & 127) < d) + { + lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; + lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai]; + } + offset <<= 1; + } + barrier(CLK_LOCAL_MEM_FENCE); + if(lid < 2) + { + lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0; + lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0; + } + for(int d = 1; d < LSIZE; d <<= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + offset >>= 1; + int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; + ai += GET_CONFLICT_OFFSET(ai); + bi += GET_CONFLICT_OFFSET(bi); + + if((lid & 127) < d) + { + lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; + lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai]; + + lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai]; + lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + if(gid == 0 && (i + lid) <= rows) + { + sum[sum_offset + i + lid] = 0; + sqsum[sqsum_offset + i + lid] = 0; + } + if(i + lid == 0) + { + int loc0 = gid * 2 * sum_step; + int loc1 = gid * 2 * sqsum_step; + for(int k = 1; k <= 8; k++) + { + if(gid * 8 + k > cols) break; + sum[sum_offset + loc0 + k * sum_step / 4] = 0; + sqsum[sqsum_offset + loc1 + k * sqsum_step / 4] = 0; + } + } + int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ; + int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ; + if(lid > 0 && (i+lid) <= rows) + { + lm_sum[0][bf_loc] += sum_t[0]; + 
lm_sum[1][bf_loc] += sum_t[1]; + lm_sqsum[0][bf_loc] += sqsum_t[0]; + lm_sqsum[1][bf_loc] += sqsum_t[1]; + sum_p = (__local float*)(&(lm_sum[0][bf_loc])); + sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc])); + for(int k = 0; k < 4; k++) + { + if(gid * 8 + k >= cols) break; + sum[loc_s0 + k * sum_step / 4] = sum_p[k]; + sqsum[loc_sq0 + k * sqsum_step / 4] = sqsum_p[k]; + } + sum_p = (__local float*)(&(lm_sum[1][bf_loc])); + sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc])); + for(int k = 0; k < 4; k++) + { + if(gid * 8 + 4 + k >= cols) break; + sum[loc_s1 + k * sum_step / 4] = sum_p[k]; + sqsum[loc_sq1 + k * sqsum_step / 4] = sqsum_p[k]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } +} diff --git a/modules/imgproc/src/opencl/integral_sum.cl b/modules/imgproc/src/opencl/integral_sum.cl new file mode 100644 index 0000000000..ee063a558a --- /dev/null +++ b/modules/imgproc/src/opencl/integral_sum.cl @@ -0,0 +1,412 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Shengen Yan,yanshengen@gmail.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif +#endif + +#define LSIZE 256 +#define LSIZE_1 255 +#define LSIZE_2 254 +#define HF_LSIZE 128 +#define LOG_LSIZE 8 +#define LOG_NUM_BANKS 5 +#define NUM_BANKS 32 +#define GET_CONFLICT_OFFSET(lid) ((lid) >> LOG_NUM_BANKS) + + +kernel void integral_sum_cols_D4(__global uchar4 *src,__global int *sum , + int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step) +{ + unsigned int lid = get_local_id(0); + unsigned int gid = get_group_id(0); + int4 src_t[2], sum_t[2]; + __local int4 lm_sum[2][LSIZE + LOG_LSIZE]; + __local int* sum_p; + src_step = src_step >> 2; + gid = gid << 1; + for(int i = 0; i < rows; i =i + LSIZE_1) + { + src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid]) : 0); + src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid + 1]) : 0); + + sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); + sum_t[1] = (i == 0 ? 
0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); + barrier(CLK_LOCAL_MEM_FENCE); + + int bf_loc = lid + GET_CONFLICT_OFFSET(lid); + lm_sum[0][bf_loc] = src_t[0]; + + lm_sum[1][bf_loc] = src_t[1]; + + int offset = 1; + for(int d = LSIZE >> 1 ; d > 0; d>>=1) + { + barrier(CLK_LOCAL_MEM_FENCE); + int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; + ai += GET_CONFLICT_OFFSET(ai); + bi += GET_CONFLICT_OFFSET(bi); + + if((lid & 127) < d) + { + lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; + } + offset <<= 1; + } + barrier(CLK_LOCAL_MEM_FENCE); + if(lid < 2) + { + lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0; + } + for(int d = 1; d < LSIZE; d <<= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + offset >>= 1; + int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; + ai += GET_CONFLICT_OFFSET(ai); + bi += GET_CONFLICT_OFFSET(bi); + + if((lid & 127) < d) + { + lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; + lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + if(lid > 0 && (i+lid) <= rows) + { + int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ; + lm_sum[0][bf_loc] += sum_t[0]; + lm_sum[1][bf_loc] += sum_t[1]; + sum_p = (__local int*)(&(lm_sum[0][bf_loc])); + for(int k = 0; k < 4; k++) + { + if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue; + sum[loc_s0 + k * dst_step / 4] = sum_p[k]; + } + sum_p = (__local int*)(&(lm_sum[1][bf_loc])); + for(int k = 0; k < 4; k++) + { + if(gid * 4 + k + 4 >= cols + pre_invalid) break; + sum[loc_s1 + k * dst_step / 4] = sum_p[k]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } +} + + +kernel void integral_sum_rows_D4(__global int4 *srcsum,__global int *sum , + int rows,int cols,int src_step,int sum_step, + int sum_offset) +{ + unsigned int lid = get_local_id(0); + unsigned int gid = get_group_id(0); + int4 src_t[2], sum_t[2]; + __local int4 lm_sum[2][LSIZE + LOG_LSIZE]; + __local int *sum_p; + src_step = src_step >> 4; + 
for(int i = 0; i < rows; i =i + LSIZE_1) + { + src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : 0; + src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : 0; + + sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); + sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); + barrier(CLK_LOCAL_MEM_FENCE); + + int bf_loc = lid + GET_CONFLICT_OFFSET(lid); + lm_sum[0][bf_loc] = src_t[0]; + + lm_sum[1][bf_loc] = src_t[1]; + + int offset = 1; + for(int d = LSIZE >> 1 ; d > 0; d>>=1) + { + barrier(CLK_LOCAL_MEM_FENCE); + int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; + ai += GET_CONFLICT_OFFSET(ai); + bi += GET_CONFLICT_OFFSET(bi); + + if((lid & 127) < d) + { + lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; + } + offset <<= 1; + } + barrier(CLK_LOCAL_MEM_FENCE); + if(lid < 2) + { + lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0; + } + for(int d = 1; d < LSIZE; d <<= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + offset >>= 1; + int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; + ai += GET_CONFLICT_OFFSET(ai); + bi += GET_CONFLICT_OFFSET(bi); + + if((lid & 127) < d) + { + lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; + lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + if(gid == 0 && (i + lid) <= rows) + { + sum[sum_offset + i + lid] = 0; + } + if(i + lid == 0) + { + int loc0 = gid * 2 * sum_step; + for(int k = 1; k <= 8; k++) + { + if(gid * 8 + k > cols) break; + sum[sum_offset + loc0 + k * sum_step / 4] = 0; + } + } + + if(lid > 0 && (i+lid) <= rows) + { + int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ; + lm_sum[0][bf_loc] += sum_t[0]; + lm_sum[1][bf_loc] += sum_t[1]; + sum_p = (__local int*)(&(lm_sum[0][bf_loc])); + for(int k = 0; k < 4; k++) + { + if(gid * 8 + k >= cols) break; + sum[loc_s0 + k * sum_step / 4] = sum_p[k]; + } + sum_p = (__local int*)(&(lm_sum[1][bf_loc])); + for(int k = 0; k < 4; k++) + { + 
if(gid * 8 + 4 + k >= cols) break; + sum[loc_s1 + k * sum_step / 4] = sum_p[k]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } +} + +kernel void integral_sum_cols_D5(__global uchar4 *src,__global float *sum , + int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step) +{ + unsigned int lid = get_local_id(0); + unsigned int gid = get_group_id(0); + float4 src_t[2], sum_t[2]; + __local float4 lm_sum[2][LSIZE + LOG_LSIZE]; + __local float* sum_p; + src_step = src_step >> 2; + gid = gid << 1; + for(int i = 0; i < rows; i =i + LSIZE_1) + { + src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + gid]) : (float4)0); + src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + gid + 1]) : (float4)0); + + sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); + sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); + barrier(CLK_LOCAL_MEM_FENCE); + + int bf_loc = lid + GET_CONFLICT_OFFSET(lid); + lm_sum[0][bf_loc] = src_t[0]; + + lm_sum[1][bf_loc] = src_t[1]; + + int offset = 1; + for(int d = LSIZE >> 1 ; d > 0; d>>=1) + { + barrier(CLK_LOCAL_MEM_FENCE); + int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; + ai += GET_CONFLICT_OFFSET(ai); + bi += GET_CONFLICT_OFFSET(bi); + + if((lid & 127) < d) + { + lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; + } + offset <<= 1; + } + barrier(CLK_LOCAL_MEM_FENCE); + if(lid < 2) + { + lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0; + } + for(int d = 1; d < LSIZE; d <<= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + offset >>= 1; + int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; + ai += GET_CONFLICT_OFFSET(ai); + bi += GET_CONFLICT_OFFSET(bi); + + if((lid & 127) < d) + { + lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; + lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + if(lid > 0 && (i+lid) <= rows) + { + int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = 
loc_s0 + dst_step ; + lm_sum[0][bf_loc] += sum_t[0]; + lm_sum[1][bf_loc] += sum_t[1]; + sum_p = (__local float*)(&(lm_sum[0][bf_loc])); + for(int k = 0; k < 4; k++) + { + if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue; + sum[loc_s0 + k * dst_step / 4] = sum_p[k]; + } + sum_p = (__local float*)(&(lm_sum[1][bf_loc])); + for(int k = 0; k < 4; k++) + { + if(gid * 4 + k + 4 >= cols + pre_invalid) break; + sum[loc_s1 + k * dst_step / 4] = sum_p[k]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } +} + + +kernel void integral_sum_rows_D5(__global float4 *srcsum,__global float *sum , + int rows,int cols,int src_step,int sum_step, + int sum_offset) +{ + unsigned int lid = get_local_id(0); + unsigned int gid = get_group_id(0); + float4 src_t[2], sum_t[2]; + __local float4 lm_sum[2][LSIZE + LOG_LSIZE]; + __local float *sum_p; + src_step = src_step >> 4; + for(int i = 0; i < rows; i =i + LSIZE_1) + { + src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (float4)0; + src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0; + + sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); + sum_t[1] = (i == 0 ? 
(float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); + barrier(CLK_LOCAL_MEM_FENCE); + + int bf_loc = lid + GET_CONFLICT_OFFSET(lid); + lm_sum[0][bf_loc] = src_t[0]; + + lm_sum[1][bf_loc] = src_t[1]; + + int offset = 1; + for(int d = LSIZE >> 1 ; d > 0; d>>=1) + { + barrier(CLK_LOCAL_MEM_FENCE); + int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; + ai += GET_CONFLICT_OFFSET(ai); + bi += GET_CONFLICT_OFFSET(bi); + + if((lid & 127) < d) + { + lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; + } + offset <<= 1; + } + barrier(CLK_LOCAL_MEM_FENCE); + if(lid < 2) + { + lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0; + } + for(int d = 1; d < LSIZE; d <<= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + offset >>= 1; + int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; + ai += GET_CONFLICT_OFFSET(ai); + bi += GET_CONFLICT_OFFSET(bi); + + if((lid & 127) < d) + { + lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; + lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + if(gid == 0 && (i + lid) <= rows) + { + sum[sum_offset + i + lid] = 0; + } + if(i + lid == 0) + { + int loc0 = gid * 2 * sum_step; + for(int k = 1; k <= 8; k++) + { + if(gid * 8 + k > cols) break; + sum[sum_offset + loc0 + k * sum_step / 4] = 0; + } + } + + if(lid > 0 && (i+lid) <= rows) + { + int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ; + lm_sum[0][bf_loc] += sum_t[0]; + lm_sum[1][bf_loc] += sum_t[1]; + sum_p = (__local float*)(&(lm_sum[0][bf_loc])); + for(int k = 0; k < 4; k++) + { + if(gid * 8 + k >= cols) break; + sum[loc_s0 + k * sum_step / 4] = sum_p[k]; + } + sum_p = (__local float*)(&(lm_sum[1][bf_loc])); + for(int k = 0; k < 4; k++) + { + if(gid * 8 + 4 + k >= cols) break; + sum[loc_s1 + k * sum_step / 4] = sum_p[k]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } +} diff --git a/modules/imgproc/src/opencl/laplacian.cl b/modules/imgproc/src/opencl/laplacian.cl new file mode 100644 index 
0000000000..ea22967dff --- /dev/null +++ b/modules/imgproc/src/opencl/laplacian.cl @@ -0,0 +1,381 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Pang Erping, erping@multicorewareinc.com +// Jia Haipeng, jiahaipeng95@gmail.com +// Peng Xiao, pengxiao@outlook.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////Macro for border type//////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////////////// +#ifdef BORDER_REPLICATE + +//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) : (i)) +#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr)) +#endif + +#ifdef BORDER_REFLECT +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? ((l_edge)<<1)-(i)-1 : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? ((t_edge)<<1)-(i)-1 : (i)) +#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr)) +#endif + +#ifdef BORDER_REFLECT_101 +//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? ((l_edge)<<1)-(i) : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? ((t_edge)<<1)-(i) : (i)) +#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? 
-(i)-2+((b_edge)<<1) : (addr)) +#endif + +#ifdef IMG_C_1_0 +#define T_IMG uchar +#define T_IMGx4 uchar4 +#define T_IMG_C1 uchar +#define CONVERT_TYPE convert_uchar_sat +#define CONVERT_TYPEx4 convert_uchar4_sat +#endif +#ifdef IMG_C_4_0 +#define T_IMG uchar4 +#define T_IMGx4 uchar16 +#define T_IMG_C1 uchar +#define CONVERT_TYPE convert_uchar4_sat +#define CONVERT_TYPEx4 convert_uchar16_sat +#endif +#ifdef IMG_C_1_5 +#define T_IMG float +#define T_IMGx4 float4 +#define T_IMG_C1 float +#define CONVERT_TYPE convert_float +#define CONVERT_TYPEx4 convert_float4 +#endif +#ifdef IMG_C_4_5 +#define T_IMG float4 +#define T_IMGx4 float16 +#define T_IMG_C1 float +#define CONVERT_TYPE convert_float4 +#define CONVERT_TYPEx4 convert_float16 +#endif + +#ifndef CN +#define CN 1 +#endif + +#if CN == 1 +#define T_SUM float +#define T_SUMx4 float4 +#define CONVERT_TYPE_SUM convert_float +#define CONVERT_TYPE_SUMx4 convert_float4 +#define SUM_ZERO (0.0f) +#define SUM_ZEROx4 (0.0f, 0.0f, 0.0f, 0.0f) +#define VLOAD4 vload4 +#define SX x +#define SY y +#define SZ z +#define SW w +#elif CN == 4 +#define T_SUM float4 +#define T_SUMx4 float16 +#define CONVERT_TYPE_SUM convert_float4 +#define CONVERT_TYPE_SUMx4 convert_float16 +#define SUM_ZERO (0.0f, 0.0f, 0.0f, 0.0f) +#define SUM_ZEROx4 (0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f) +#define VLOAD4 vload16 +#define SX s0123 +#define SY s4567 +#define SZ s89ab +#define SW scdef +#endif + +#ifndef FILTER_SIZE +#define FILTER_SIZE 3 +#endif + +#define LOCAL_GROUP_SIZE 16 + +#define LOCAL_WIDTH ((FILTER_SIZE/2)*2 + LOCAL_GROUP_SIZE) +#define LOCAL_HEIGHT ((FILTER_SIZE/2)*2 + LOCAL_GROUP_SIZE) + +#define FILTER_RADIUS (FILTER_SIZE >> 1) + +__kernel void filter2D( + __global T_IMG *src, + __global T_IMG *dst, + int src_step, + int dst_step, + __constant float *mat_kernel, + __local T_IMG *local_data, + int wholerows, + int wholecols, + int src_offset_x, + int src_offset_y, + int dst_offset_x, + 
int dst_offset_y, + int cols, + int rows, + int operate_cols +) +{ + int groupStartCol = get_group_id(0) * get_local_size(0); + int groupStartRow = get_group_id(1) * get_local_size(1); + + int localCol = get_local_id(0); + int localRow = get_local_id(1); + int globalCol = groupStartCol + localCol; + int globalRow = groupStartRow + localRow; + const int src_offset = mad24(src_offset_y, src_step, src_offset_x); + const int dst_offset = mad24(dst_offset_y, dst_step, dst_offset_x); + +#ifdef BORDER_CONSTANT + for(int i = localRow; i < LOCAL_HEIGHT; i += get_local_size(1)) + { + int curRow = groupStartRow + i; + for(int j = localCol; j < LOCAL_WIDTH; j += get_local_size(0)) + { + int curCol = groupStartCol + j; + if(curRow < FILTER_RADIUS - src_offset_y || (curRow - FILTER_RADIUS) >= wholerows - src_offset_y|| + curCol < FILTER_RADIUS - src_offset_x || (curCol - FILTER_RADIUS) >= wholecols - src_offset_x) + { + local_data[(i) * LOCAL_WIDTH + j] = 0; + } + else + { + local_data[(i) * LOCAL_WIDTH + j] = src[(curRow - FILTER_RADIUS) * src_step + curCol - FILTER_RADIUS + src_offset]; + } + } + } +#else + for(int i = localRow; i < LOCAL_HEIGHT; i += get_local_size(1)) + { + int curRow = groupStartRow + i; + + curRow = ADDR_H(curRow, FILTER_RADIUS - src_offset_y, wholerows - src_offset_y); + + curRow = ADDR_B(curRow - FILTER_RADIUS, wholerows - src_offset_y, curRow - FILTER_RADIUS); + + for(int j = localCol; j < LOCAL_WIDTH; j += get_local_size(0)) + { + int curCol = groupStartCol + j; + curCol = ADDR_L(curCol, FILTER_RADIUS - src_offset_x, wholecols - src_offset_x); + curCol = ADDR_R(curCol - FILTER_RADIUS, wholecols - src_offset_x, curCol - FILTER_RADIUS); + if(curRow < wholerows && curCol < wholecols) + { + local_data[(i) * LOCAL_WIDTH + j] = src[(curRow) * src_step + curCol + src_offset]; + } + } + } +#endif + + barrier(CLK_LOCAL_MEM_FENCE); + if(globalRow < rows && globalCol < cols) + { + T_SUM sum = (T_SUM)(SUM_ZERO); + int filterIdx = 0; + for(int i = 0; i < 
FILTER_SIZE; i++) + { + int offset = (i + localRow) * LOCAL_WIDTH; + + for(int j = 0; j < FILTER_SIZE; j++) + { + sum += CONVERT_TYPE_SUM(local_data[offset + j + localCol]) * mat_kernel[filterIdx++]; + } + } + dst[(globalRow)*dst_step + (globalCol) + dst_offset] = CONVERT_TYPE(sum); + } +} + +/// following is specific for 3x3 kernels + +////////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////Macro for define elements number per thread///////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#define ANX 1 +#define ANY 1 + +#define ROWS_PER_GROUP 4 +#define ROWS_PER_GROUP_BITS 2 +#define ROWS_FETCH (ROWS_PER_GROUP + ANY + ANY) //(ROWS_PER_GROUP + anY * 2) + +#define THREADS_PER_ROW 64 +#define THREADS_PER_ROW_BIT 6 + +#define ELEMENTS_PER_THREAD 4 +#define ELEMENTS_PER_THREAD_BIT 2 + +#define LOCAL_MEM_STEP 260 //divup((get_local_size(0) + anX * 2), 4) * 4 + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////8uC1//////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +__kernel void filter2D_3x3( + __global T_IMG *src, + __global T_IMG *dst, + int src_step, + int dst_step, + __constant float *mat_kernel, + __local T_IMG *local_data, + int wholerows, + int wholecols, + int src_offset_x, + int src_offset_y, + int dst_offset_x, + int dst_offset_y, + int cols, + int rows, + int operate_cols +) +{ + int gX = get_global_id(0); + int gY = get_global_id(1); + + int lX = get_local_id(0); + + int groupX_size = get_local_size(0); + int groupX_id = get_group_id(0); + +#define dst_align (dst_offset_x & 3) + int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX; + int rows_start_index = src_offset_y + (gY 
<< ROWS_PER_GROUP_BITS) - ANY; + + if((gY << 2) < rows) + { + for(int i = 0; i < ROWS_FETCH; ++i) + { + if((rows_start_index - src_offset_y) + i < rows + ANY) + { +#ifdef BORDER_CONSTANT + int selected_row = rows_start_index + i; + int selected_cols = cols_start_index_group + lX; + + T_IMG data = src[mad24(selected_row, src_step, selected_cols)]; + int con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols; + data = con ? data : (T_IMG)(0); + local_data[mad24(i, LOCAL_MEM_STEP, lX)] = data; + + if(lX < (ANX << 1)) + { + selected_cols = cols_start_index_group + lX + groupX_size; + + data = src[mad24(selected_row, src_step, selected_cols)]; + con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols; + data = con ? data : (T_IMG)(0); + local_data[mad24(i, LOCAL_MEM_STEP, lX) + groupX_size] = data; + } +#else + int selected_row = ADDR_H(rows_start_index + i, 0, wholerows); + selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row); + + int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols); + selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols); + + T_IMG data = src[mad24(selected_row, src_step, selected_cols)]; + + local_data[mad24(i, LOCAL_MEM_STEP, lX)] = data; + + if(lX < (ANX << 1)) + { + selected_cols = cols_start_index_group + lX + groupX_size; + selected_cols = ADDR_R(selected_cols, wholecols, selected_cols); + + data = src[mad24(selected_row, src_step, selected_cols)]; + local_data[mad24(i, LOCAL_MEM_STEP, lX) + groupX_size] = data; + } +#endif + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2); + if(((gY << 2) < rows) && (process_col < operate_cols)) + { + int dst_cols_start = dst_offset_x; + int dst_cols_end = dst_offset_x + cols; + int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc; + + int dst_rows_end = dst_offset_y + rows; + int 
dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT); + dst = dst + mad24(dst_rows_index, dst_step, dst_cols_index); + + T_IMGx4 dst_data = *(__global T_IMGx4 *)dst; + + T_SUMx4 sum = (T_SUMx4)SUM_ZEROx4; + T_IMGx4 data; + + for(int i = 0; i < FILTER_SIZE; i++) + { +#pragma unroll + for(int j = 0; j < FILTER_SIZE; j++) + { + if(dst_rows_index < dst_rows_end) + { + int local_row = (lX >> THREADS_PER_ROW_BIT) + i; + int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j; + + data = VLOAD4(0, (__local T_IMG_C1 *)(local_data + local_row * LOCAL_MEM_STEP + local_cols)); + sum = sum + (mat_kernel[i * FILTER_SIZE + j] * CONVERT_TYPE_SUMx4(data)); + } + } + } + + if(dst_rows_index < dst_rows_end) + { + T_IMGx4 tmp_dst = CONVERT_TYPEx4(sum); + tmp_dst.SX = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end)) ? + tmp_dst.SX : dst_data.SX; + tmp_dst.SY = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ? + tmp_dst.SY : dst_data.SY; + tmp_dst.SZ = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ? + tmp_dst.SZ : dst_data.SZ; + tmp_dst.SW = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ? + tmp_dst.SW : dst_data.SW; + *(__global T_IMGx4 *)dst = tmp_dst; + } + } +} diff --git a/modules/imgproc/src/opencl/match_template.cl b/modules/imgproc/src/opencl/match_template.cl new file mode 100644 index 0000000000..6fc4c748cf --- /dev/null +++ b/modules/imgproc/src/opencl/match_template.cl @@ -0,0 +1,857 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. 
+// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Peng Xiao, pengxiao@multicorewareinc.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#pragma OPENCL EXTENSION cl_amd_printf : enable + +#if defined (DOUBLE_SUPPORT) + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif + +#define TYPE_IMAGE_SQSUM double +#else +#define TYPE_IMAGE_SQSUM float +#endif + +#ifndef CN4 +#define CN4 1 +#else +#define CN4 4 +#endif + +////////////////////////////////////////////////// +// utilities +#define SQSUMS_PTR(ox, oy) mad24(gidy + oy, img_sqsums_step, (gidx + img_sqsums_offset + ox) * CN4) +#define SUMS_PTR(ox, oy) mad24(gidy + oy, img_sums_step, gidx + img_sums_offset + ox) +// normAcc* are accurate normalization routines which make GPU matchTemplate +// consistent with CPU one +float normAcc(float num, float denum) +{ + if(fabs(num) < denum) + { + return num / denum; + } + if(fabs(num) < denum * 1.125f) + { + return num > 0 ? 1 : -1; + } + return 0; +} + +float normAcc_SQDIFF(float num, float denum) +{ + if(fabs(num) < denum) + { + return num / denum; + } + if(fabs(num) < denum * 1.125f) + { + return num > 0 ? 
1 : -1; + } + return 1; +} +////////////////////////////////////////////////////////////////////// +// normalize + +__kernel +void normalizeKernel_C1_D0 +( + __global const float * img_sqsums, + __global float * res, + ulong tpl_sqsum, + int res_rows, + int res_cols, + int tpl_rows, + int tpl_cols, + int img_sqsums_offset, + int img_sqsums_step, + int res_offset, + int res_step +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + img_sqsums_step /= sizeof(*img_sqsums); + img_sqsums_offset /= sizeof(*img_sqsums); + int res_idx = mad24(gidy, res_step, res_offset + gidx); + if(gidx < res_cols && gidy < res_rows) + { + float image_sqsum_ = (float)( + (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); + res[res_idx] = normAcc(res[res_idx], sqrt(image_sqsum_ * tpl_sqsum)); + } +} + +__kernel +void matchTemplate_Prepared_SQDIFF_C1_D0 +( + __global const TYPE_IMAGE_SQSUM * img_sqsums, + __global float * res, + ulong tpl_sqsum, + int res_rows, + int res_cols, + int tpl_rows, + int tpl_cols, + int img_sqsums_offset, + int img_sqsums_step, + int res_offset, + int res_step +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + img_sqsums_step /= sizeof(*img_sqsums); + img_sqsums_offset /= sizeof(*img_sqsums); + int res_idx = mad24(gidy, res_step, res_offset + gidx); + if(gidx < res_cols && gidy < res_rows) + { + float image_sqsum_ = (float)( + (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); + res[res_idx] = image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum; + } +} + +__kernel +void matchTemplate_Prepared_SQDIFF_NORMED_C1_D0 +( + __global const float * img_sqsums, + __global float * res, + ulong tpl_sqsum, + int 
res_rows, + int res_cols, + int tpl_rows, + int tpl_cols, + int img_sqsums_offset, + int img_sqsums_step, + int res_offset, + int res_step +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + img_sqsums_step /= sizeof(*img_sqsums); + img_sqsums_offset /= sizeof(*img_sqsums); + int res_idx = mad24(gidy, res_step, res_offset + gidx); + if(gidx < res_cols && gidy < res_rows) + { + float image_sqsum_ = (float)( + (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); + res[res_idx] = normAcc_SQDIFF(image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum, + sqrt(image_sqsum_ * tpl_sqsum)); + } +} + +////////////////////////////////////////////////// +// SQDIFF +__kernel +void matchTemplate_Naive_SQDIFF_C1_D0 +( + __global const uchar * img, + __global const uchar * tpl, + __global float * res, + int img_rows, + int img_cols, + int tpl_rows, + int tpl_cols, + int res_rows, + int res_cols, + int img_offset, + int tpl_offset, + int res_offset, + int img_step, + int tpl_step, + int res_step +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int i,j; + int delta; + int sum = 0; + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + int res_idx = mad24(gidy, res_step, res_offset + gidx); + + if(gidx < res_cols && gidy < res_rows) + { + for(i = 0; i < tpl_rows; i ++) + { + // get specific rows of img data + __global const uchar * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); + __global const uchar * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); + for(j = 0; j < tpl_cols; j ++) + { + delta = img_ptr[j] - tpl_ptr[j]; + sum = mad24(delta, delta, sum); + } + } + res[res_idx] = sum; + } +} + +__kernel +void matchTemplate_Naive_SQDIFF_C1_D5 +( + __global const float * img, + __global const float * tpl, + __global float * res, + int img_rows, + int img_cols, + int tpl_rows, + 
int tpl_cols, + int res_rows, + int res_cols, + int img_offset, + int tpl_offset, + int res_offset, + int img_step, + int tpl_step, + int res_step +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int i,j; + float delta; + float sum = 0; + img_step /= sizeof(*img); + img_offset /= sizeof(*img); + tpl_step /= sizeof(*tpl); + tpl_offset /= sizeof(*tpl); + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + + int res_idx = mad24(gidy, res_step, res_offset + gidx); + + if(gidx < res_cols && gidy < res_rows) + { + for(i = 0; i < tpl_rows; i ++) + { + // get specific rows of img data + __global const float * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); + __global const float * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); + for(j = 0; j < tpl_cols; j ++) + { + delta = img_ptr[j] - tpl_ptr[j]; + sum = mad(delta, delta, sum); + } + } + res[res_idx] = sum; + } +} + +__kernel +void matchTemplate_Naive_SQDIFF_C4_D0 +( + __global const uchar4 * img, + __global const uchar4 * tpl, + __global float * res, + int img_rows, + int img_cols, + int tpl_rows, + int tpl_cols, + int res_rows, + int res_cols, + int img_offset, + int tpl_offset, + int res_offset, + int img_step, + int tpl_step, + int res_step +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int i,j; + int4 delta; + int4 sum = (int4)(0, 0, 0, 0); + img_step /= sizeof(*img); + img_offset /= sizeof(*img); + tpl_step /= sizeof(*tpl); + tpl_offset /= sizeof(*tpl); + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + + int res_idx = mad24(gidy, res_step, res_offset + gidx); + + if(gidx < res_cols && gidy < res_rows) + { + for(i = 0; i < tpl_rows; i ++) + { + // get specific rows of img data + __global const uchar4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); + __global const uchar4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); + for(j = 0; j < tpl_cols; j ++) + { + //delta = convert_int4(img_ptr[j] - tpl_ptr[j]); // this alternative 
is incorrect + delta.x = img_ptr[j].x - tpl_ptr[j].x; + delta.y = img_ptr[j].y - tpl_ptr[j].y; + delta.z = img_ptr[j].z - tpl_ptr[j].z; + delta.w = img_ptr[j].w - tpl_ptr[j].w; + sum = mad24(delta, delta, sum); + } + } + res[res_idx] = sum.x + sum.y + sum.z + sum.w; + } +} + +__kernel +void matchTemplate_Naive_SQDIFF_C4_D5 +( + __global const float4 * img, + __global const float4 * tpl, + __global float * res, + int img_rows, + int img_cols, + int tpl_rows, + int tpl_cols, + int res_rows, + int res_cols, + int img_offset, + int tpl_offset, + int res_offset, + int img_step, + int tpl_step, + int res_step +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int i,j; + float4 delta; + float4 sum = (float4)(0, 0, 0, 0); + img_step /= sizeof(*img); + img_offset /= sizeof(*img); + tpl_step /= sizeof(*tpl); + tpl_offset /= sizeof(*tpl); + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + + int res_idx = mad24(gidy, res_step, res_offset + gidx); + + if(gidx < res_cols && gidy < res_rows) + { + for(i = 0; i < tpl_rows; i ++) + { + // get specific rows of img data + __global const float4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); + __global const float4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); + for(j = 0; j < tpl_cols; j ++) + { + //delta = convert_int4(img_ptr[j] - tpl_ptr[j]); // this alternative is incorrect + delta.x = img_ptr[j].x - tpl_ptr[j].x; + delta.y = img_ptr[j].y - tpl_ptr[j].y; + delta.z = img_ptr[j].z - tpl_ptr[j].z; + delta.w = img_ptr[j].w - tpl_ptr[j].w; + sum = mad(delta, delta, sum); + } + } + res[res_idx] = sum.x + sum.y + sum.z + sum.w; + } +} + +////////////////////////////////////////////////// +// CCORR +__kernel +void matchTemplate_Naive_CCORR_C1_D0 +( + __global const uchar * img, + __global const uchar * tpl, + __global float * res, + int img_rows, + int img_cols, + int tpl_rows, + int tpl_cols, + int res_rows, + int res_cols, + int img_offset, + int tpl_offset, + int res_offset, + int 
img_step, + int tpl_step, + int res_step +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int i,j; + int sum = 0; + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + + int res_idx = mad24(gidy, res_step, res_offset + gidx); + + if(gidx < res_cols && gidy < res_rows) + { + for(i = 0; i < tpl_rows; i ++) + { + // get specific rows of img data + __global const uchar * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); + __global const uchar * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); + for(j = 0; j < tpl_cols; j ++) + { + sum = mad24(convert_int(img_ptr[j]), convert_int(tpl_ptr[j]), sum); + } + } + res[res_idx] = (float)sum; + } +} + +__kernel +void matchTemplate_Naive_CCORR_C1_D5 +( + __global const float * img, + __global const float * tpl, + __global float * res, + int img_rows, + int img_cols, + int tpl_rows, + int tpl_cols, + int res_rows, + int res_cols, + int img_offset, + int tpl_offset, + int res_offset, + int img_step, + int tpl_step, + int res_step +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int i,j; + float sum = 0; + img_step /= sizeof(*img); + img_offset /= sizeof(*img); + tpl_step /= sizeof(*tpl); + tpl_offset /= sizeof(*tpl); + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + + int res_idx = mad24(gidy, res_step, res_offset + gidx); + + if(gidx < res_cols && gidy < res_rows) + { + for(i = 0; i < tpl_rows; i ++) + { + // get specific rows of img data + __global const float * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); + __global const float * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); + for(j = 0; j < tpl_cols; j ++) + { + sum = mad(img_ptr[j], tpl_ptr[j], sum); + } + } + res[res_idx] = sum; + } +} + +__kernel +void matchTemplate_Naive_CCORR_C4_D0 +( + __global const uchar4 * img, + __global const uchar4 * tpl, + __global float * res, + int img_rows, + int img_cols, + int tpl_rows, + int tpl_cols, + int res_rows, + int res_cols, + int img_offset, + 
int tpl_offset, + int res_offset, + int img_step, + int tpl_step, + int res_step +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int i,j; + int4 sum = (int4)(0, 0, 0, 0); + img_step /= sizeof(*img); + img_offset /= sizeof(*img); + tpl_step /= sizeof(*tpl); + tpl_offset /= sizeof(*tpl); + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + + int res_idx = mad24(gidy, res_step, res_offset + gidx); + + if(gidx < res_cols && gidy < res_rows) + { + for(i = 0; i < tpl_rows; i ++) + { + // get specific rows of img data + __global const uchar4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); + __global const uchar4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); + for(j = 0; j < tpl_cols; j ++) + { + sum = mad24(convert_int4(img_ptr[j]), convert_int4(tpl_ptr[j]), sum); + } + } + res[res_idx] = (float)(sum.x + sum.y + sum.z + sum.w); + } +} + +__kernel +void matchTemplate_Naive_CCORR_C4_D5 +( + __global const float4 * img, + __global const float4 * tpl, + __global float * res, + int img_rows, + int img_cols, + int tpl_rows, + int tpl_cols, + int res_rows, + int res_cols, + int img_offset, + int tpl_offset, + int res_offset, + int img_step, + int tpl_step, + int res_step +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int i,j; + float4 sum = (float4)(0, 0, 0, 0); + img_step /= sizeof(*img); + img_offset /= sizeof(*img); + tpl_step /= sizeof(*tpl); + tpl_offset /= sizeof(*tpl); + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + + int res_idx = mad24(gidy, res_step, res_offset + gidx); + + if(gidx < res_cols && gidy < res_rows) + { + for(i = 0; i < tpl_rows; i ++) + { + // get specific rows of img data + __global const float4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); + __global const float4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); + for(j = 0; j < tpl_cols; j ++) + { + sum = mad(convert_float4(img_ptr[j]), convert_float4(tpl_ptr[j]), sum); + } + } + res[res_idx] = sum.x + 
sum.y + sum.z + sum.w; + } +} + +////////////////////////////////////////////////// +// CCOFF +__kernel +void matchTemplate_Prepared_CCOFF_C1_D0 +( + __global float * res, + int img_rows, + int img_cols, + int tpl_rows, + int tpl_cols, + int res_rows, + int res_cols, + int res_offset, + int res_step, + __global const uint * img_sums, + int img_sums_offset, + int img_sums_step, + float tpl_sum +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + img_sums_offset /= sizeof(*img_sums); + img_sums_step /= sizeof(*img_sums); + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + + int res_idx = mad24(gidy, res_step, res_offset + gidx); + + if(gidx < res_cols && gidy < res_rows) + { + float sum = (float)((img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)]) + -(img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)])); + res[res_idx] -= sum * tpl_sum; + } +} +__kernel +void matchTemplate_Prepared_CCOFF_C4_D0 +( + __global float * res, + int img_rows, + int img_cols, + int tpl_rows, + int tpl_cols, + int res_rows, + int res_cols, + int res_offset, + int res_step, + __global const uint * img_sums_c0, + __global const uint * img_sums_c1, + __global const uint * img_sums_c2, + __global const uint * img_sums_c3, + int img_sums_offset, + int img_sums_step, + float tpl_sum_c0, + float tpl_sum_c1, + float tpl_sum_c2, + float tpl_sum_c3 +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + img_sums_offset /= sizeof(*img_sums_c0); + img_sums_step /= sizeof(*img_sums_c0); + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + + int res_idx = mad24(gidy, res_step, res_offset + gidx); + + if(gidx < res_cols && gidy < res_rows) + { + float ccorr = res[res_idx]; + ccorr -= tpl_sum_c0*(float)( + (img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)])); + ccorr -= tpl_sum_c1*(float)( + (img_sums_c1[SUMS_PTR(tpl_cols, 
tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)])); + ccorr -= tpl_sum_c2*(float)( + (img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)])); + ccorr -= tpl_sum_c3*(float)( + (img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)])); + res[res_idx] = ccorr; + } +} + +__kernel +void matchTemplate_Prepared_CCOFF_NORMED_C1_D0 +( + __global float * res, + int img_rows, + int img_cols, + int tpl_rows, + int tpl_cols, + int res_rows, + int res_cols, + int res_offset, + int res_step, + float weight, + __global const uint * img_sums, + int img_sums_offset, + int img_sums_step, + __global const float * img_sqsums, + int img_sqsums_offset, + int img_sqsums_step, + float tpl_sum, + float tpl_sqsum +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + img_sqsums_step /= sizeof(*img_sqsums); + img_sqsums_offset /= sizeof(*img_sqsums); + img_sums_offset /= sizeof(*img_sums); + img_sums_step /= sizeof(*img_sums); + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + + + int res_idx = mad24(gidy, res_step, res_offset + gidx); + + if(gidx < res_cols && gidy < res_rows) + { + float image_sum_ = (float)( + (img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)]) + - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)])); + + float image_sqsum_ = (float)( + (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); + res[res_idx] = normAcc(res[res_idx] - image_sum_ * tpl_sum, + sqrt(tpl_sqsum * (image_sqsum_ - weight * image_sum_ * image_sum_))); + } +} +__kernel +void matchTemplate_Prepared_CCOFF_NORMED_C4_D0 +( + __global float * res, + int img_rows, + int img_cols, + int 
tpl_rows, + int tpl_cols, + int res_rows, + int res_cols, + int res_offset, + int res_step, + float weight, + __global const uint * img_sums_c0, + __global const uint * img_sums_c1, + __global const uint * img_sums_c2, + __global const uint * img_sums_c3, + int img_sums_offset, + int img_sums_step, + __global const float * img_sqsums_c0, + __global const float * img_sqsums_c1, + __global const float * img_sqsums_c2, + __global const float * img_sqsums_c3, + int img_sqsums_offset, + int img_sqsums_step, + float tpl_sum_c0, + float tpl_sum_c1, + float tpl_sum_c2, + float tpl_sum_c3, + float tpl_sqsum +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + img_sqsums_step /= sizeof(*img_sqsums_c0); + img_sqsums_offset /= sizeof(*img_sqsums_c0); + img_sums_offset /= sizeof(*img_sums_c0); + img_sums_step /= sizeof(*img_sums_c0); + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + + int res_idx = mad24(gidy, res_step, res_offset + gidx); + + if(gidx < res_cols && gidy < res_rows) + { + float image_sum_c0 = (float)( + (img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)])); + float image_sum_c1 = (float)( + (img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)])); + float image_sum_c2 = (float)( + (img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)])); + float image_sum_c3 = (float)( + (img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)])); + + float image_sqsum_c0 = (float)( + (img_sqsums_c0[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums_c0[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(0, 0)])); + float image_sqsum_c1 
= (float)( + (img_sqsums_c1[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums_c1[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(0, 0)])); + float image_sqsum_c2 = (float)( + (img_sqsums_c2[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums_c2[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(0, 0)])); + float image_sqsum_c3 = (float)( + (img_sqsums_c3[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums_c3[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(0, 0)])); + + float num = res[res_idx] - + image_sum_c0 * tpl_sum_c0 - + image_sum_c1 * tpl_sum_c1 - + image_sum_c2 * tpl_sum_c2 - + image_sum_c3 * tpl_sum_c3; + float denum = sqrt( tpl_sqsum * ( + image_sqsum_c0 - weight * image_sum_c0 * image_sum_c0 + + image_sqsum_c1 - weight * image_sum_c1 * image_sum_c1 + + image_sqsum_c2 - weight * image_sum_c2 * image_sum_c2 + + image_sqsum_c3 - weight * image_sum_c0 * image_sum_c3) + ); + res[res_idx] = normAcc(num, denum); + } +} + +////////////////////////////////////////////////////////////////////// +// extractFirstChannel +__kernel +void extractFirstChannel +( + const __global float4* img, + __global float* res, + int rows, + int cols, + int img_offset, + int res_offset, + int img_step, + int res_step +) +{ + img_step /= sizeof(float4); + res_step /= sizeof(float); + img_offset /= sizeof(float4); + res_offset /= sizeof(float); + img += img_offset; + res += res_offset; + int gidx = get_global_id(0); + int gidy = get_global_id(1); + if(gidx < cols && gidy < rows) + { + res[gidx + gidy * res_step] = img[gidx + gidy * img_step].x; + } +} diff --git a/modules/imgproc/src/opencl/median.cl b/modules/imgproc/src/opencl/median.cl new file mode 100644 index 0000000000..ccb529957b --- /dev/null +++ b/modules/imgproc/src/opencl/median.cl @@ -0,0 +1,486 @@ +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, 
Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Niko Li, newlife20080214@gmail.com +// Zero Lin, zero.lin@amd.com +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +// + + +/* +__kernel void medianFilter_C1(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, int cols, + int rows, int srcStep, int dstStep, int m) +{ + int dx = get_global_id(0)-(m>>1); + int dy = get_global_id(1)-(m>>1); + + short histom[256]; + for(int i=0;i<256;++i) + histom[i]=0; + + + for(int i=0;i>1; + int v; + for(int i=0;i<256;++i) + { + v=(now= (r_edge) ? (r_edge)-1 : (addr)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i)) +#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr)) +#endif + +#ifdef BORDER_REFLECT +//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i)) +#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr)) +#endif + +#ifdef BORDER_REFLECT101 +//BORDER_REFLECT101: gfedcb|abcdefgh|gfedcba +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i)) +#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr)) +#endif + +#ifdef BORDER_WRAP +//BORDER_WRAP: cdefgh|abcdefgh|abcdefg +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i)) +#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr)) +#endif + +#define THREADS 256 +#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? 
(elem1) : (elem2) +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////calcHarris//////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// +__kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy, __global float *dst, + int dx_offset, int dx_whole_rows, int dx_whole_cols, int dx_step, + int dy_offset, int dy_whole_rows, int dy_whole_cols, int dy_step, + int dst_offset, int dst_rows, int dst_cols, int dst_step, + float k) +{ + int col = get_local_id(0); + const int gX = get_group_id(0); + const int gY = get_group_id(1); + const int glx = get_global_id(0); + const int gly = get_global_id(1); + + int dx_x_off = (dx_offset % dx_step) >> 2; + int dx_y_off = dx_offset / dx_step; + int dy_x_off = (dy_offset % dy_step) >> 2; + int dy_y_off = dy_offset / dy_step; + int dst_x_off = (dst_offset % dst_step) >> 2; + int dst_y_off = dst_offset / dst_step; + + int dx_startX = gX * (THREADS-ksX+1) - anX + dx_x_off; + int dx_startY = (gY << 1) - anY + dx_y_off; + int dy_startX = gX * (THREADS-ksX+1) - anX + dy_x_off; + int dy_startY = (gY << 1) - anY + dy_y_off; + int dst_startX = gX * (THREADS-ksX+1) + dst_x_off; + int dst_startY = (gY << 1) + dst_y_off; + + float dx_data[ksY+1],dy_data[ksY+1],data[3][ksY+1]; + __local float temp[6][THREADS]; +#ifdef BORDER_CONSTANT + bool dx_con,dy_con; + float dx_s,dy_s; + for(int i=0; i < ksY+1; i++) + { + dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows; + dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)]; + dx_data[i] = dx_con ? dx_s : 0.0; + dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows; + dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)]; + dy_data[i] = dy_con ? 
dy_s : 0.0; + data[0][i] = dx_data[i] * dx_data[i]; + data[1][i] = dx_data[i] * dy_data[i]; + data[2][i] = dy_data[i] * dy_data[i]; + } +#else + int clamped_col = min(dst_cols, col); + + for(int i=0; i < ksY+1; i++) + { + int dx_selected_row; + int dx_selected_col; + dx_selected_row = ADDR_H(dx_startY+i, 0, dx_whole_rows); + dx_selected_row = ADDR_B(dx_startY+i, dx_whole_rows, dx_selected_row); + dx_selected_col = ADDR_L(dx_startX+clamped_col, 0, dx_whole_cols); + dx_selected_col = ADDR_R(dx_startX+clamped_col, dx_whole_cols, dx_selected_col); + dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col]; + + int dy_selected_row; + int dy_selected_col; + dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows); + dy_selected_row = ADDR_B(dy_startY+i, dy_whole_rows, dy_selected_row); + dy_selected_col = ADDR_L(dy_startX+clamped_col, 0, dy_whole_cols); + dy_selected_col = ADDR_R(dy_startX+clamped_col, dy_whole_cols, dy_selected_col); + dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col]; + + data[0][i] = dx_data[i] * dx_data[i]; + data[1][i] = dx_data[i] * dy_data[i]; + data[2][i] = dy_data[i] * dy_data[i]; + } +#endif + float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0; + for(int i=1; i < ksY; i++) + { + sum0 += (data[0][i]); + sum1 += (data[1][i]); + sum2 += (data[2][i]); + } + float sum01,sum02,sum11,sum12,sum21,sum22; + sum01 = sum0 + (data[0][0]); + sum02 = sum0 + (data[0][ksY]); + temp[0][col] = sum01; + temp[1][col] = sum02; + sum11 = sum1 + (data[1][0]); + sum12 = sum1 + (data[1][ksY]); + temp[2][col] = sum11; + temp[3][col] = sum12; + sum21 = sum2 + (data[2][0]); + sum22 = sum2 + (data[2][ksY]); + temp[4][col] = sum21; + temp[5][col] = sum22; + barrier(CLK_LOCAL_MEM_FENCE); + if(col < (THREADS-(ksX-1))) + { + col += anX; + int posX = dst_startX - dst_x_off + col - anX; + int posY = (gly << 1); + int till = (ksX + 1)%2; + float tmp_sum[6]={ 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0 }; + for(int k=0; k<6; k++) + for(int i=-anX; i<=anX - till; i++) + { + 
tmp_sum[k] += temp[k][col+i]; + } + + if(posX < dst_cols && (posY) < dst_rows) + { + float a = tmp_sum[0] * 0.5f; + float b = tmp_sum[2]; + float c = tmp_sum[4] * 0.5f; + dst[(dst_startY+0) * (dst_step>>2)+ dst_startX + col - anX] = (float)((a+c) - sqrt((a-c)*(a-c) + b*b)); + } + if(posX < dst_cols && (posY + 1) < dst_rows) + { + float a = tmp_sum[1] * 0.5f; + float b = tmp_sum[3]; + float c = tmp_sum[5] * 0.5f; + dst[(dst_startY+1) * (dst_step>>2)+ dst_startX + col - anX] = (float)((a+c) - sqrt((a-c)*(a-c) + b*b)); + } + } +} diff --git a/modules/imgproc/src/opencl/moments.cl b/modules/imgproc/src/opencl/moments.cl new file mode 100644 index 0000000000..d61b8d5ae7 --- /dev/null +++ b/modules/imgproc/src/opencl/moments.cl @@ -0,0 +1,980 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Sen Liu, swjtuls1987@126.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if defined (DOUBLE_SUPPORT) + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif +typedef double T; +typedef double F; +typedef double4 F4; +#define convert_F4 convert_double4 + +#else +typedef float F; +typedef float4 F4; +typedef long T; +#define convert_F4 convert_float4 +#endif + +#define DST_ROW_00 0 +#define DST_ROW_10 1 +#define DST_ROW_01 2 +#define DST_ROW_20 3 +#define DST_ROW_11 4 +#define DST_ROW_02 5 +#define DST_ROW_30 6 +#define DST_ROW_21 7 +#define DST_ROW_12 8 +#define DST_ROW_03 9 + +__kernel void icvContourMoments(int contour_total, + __global float* reader_oclmat_data, + __global T* dst_a, + int dst_step) +{ + T xi_1, yi_1, xi_12, yi_12, xi, yi, xi2, yi2, dxy, xii_1, yii_1; + int idx = get_global_id(0); + + if (idx < 0 || idx >= contour_total) + return; + + xi_1 = (T)(*(reader_oclmat_data + (get_global_id(0) << 1))); + yi_1 = (T)(*(reader_oclmat_data + (get_global_id(0) << 1) + 1)); + xi_12 
= xi_1 * xi_1; + yi_12 = yi_1 * yi_1; + + if(idx == contour_total - 1) + { + xi = (T)(*(reader_oclmat_data)); + yi = (T)(*(reader_oclmat_data + 1)); + } + else + { + xi = (T)(*(reader_oclmat_data + (idx + 1) * 2)); + yi = (T)(*(reader_oclmat_data + (idx + 1) * 2 + 1)); + } + + xi2 = xi * xi; + yi2 = yi * yi; + dxy = xi_1 * yi - xi * yi_1; + xii_1 = xi_1 + xi; + yii_1 = yi_1 + yi; + + dst_step /= sizeof(T); + *( dst_a + DST_ROW_00 * dst_step + idx) = dxy; + *( dst_a + DST_ROW_10 * dst_step + idx) = dxy * xii_1; + *( dst_a + DST_ROW_01 * dst_step + idx) = dxy * yii_1; + *( dst_a + DST_ROW_20 * dst_step + idx) = dxy * (xi_1 * xii_1 + xi2); + *( dst_a + DST_ROW_11 * dst_step + idx) = dxy * (xi_1 * (yii_1 + yi_1) + xi * (yii_1 + yi)); + *( dst_a + DST_ROW_02 * dst_step + idx) = dxy * (yi_1 * yii_1 + yi2); + *( dst_a + DST_ROW_30 * dst_step + idx) = dxy * xii_1 * (xi_12 + xi2); + *( dst_a + DST_ROW_03 * dst_step + idx) = dxy * yii_1 * (yi_12 + yi2); + *( dst_a + DST_ROW_21 * dst_step + idx) = + dxy * (xi_12 * (3 * yi_1 + yi) + 2 * xi * xi_1 * yii_1 + + xi2 * (yi_1 + 3 * yi)); + *( dst_a + DST_ROW_12 * dst_step + idx) = + dxy * (yi_12 * (3 * xi_1 + xi) + 2 * yi * yi_1 * xii_1 + + yi2 * (xi_1 + 3 * xi)); +} + +__kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_width, int TILE_SIZE, + __global F* sum, __global F* dst_m, int dst_step) +{ + int gidy = get_global_id(0); + int gidx = get_global_id(1); + int block_y = src_rows/tile_height; + int block_x = src_cols/tile_width; + int block_num; + + if(src_rows > TILE_SIZE && src_rows % TILE_SIZE != 0) + block_y ++; + if(src_cols > TILE_SIZE && src_cols % TILE_SIZE != 0) + block_x ++; + block_num = block_y * block_x; + __local F dst_sum[10][128]; + if(gidy<128-block_num) + for(int i=0; i<10; i++) + dst_sum[i][gidy+block_num]=0; + barrier(CLK_LOCAL_MEM_FENCE); + + dst_step /= sizeof(F); + if(gidy0; lsize>>=1) + { + if(gidy 0 ) //channel of interest + for(int i = 0; i < tileSize_width; i += VLEN_C) + { + 
for(int j=0; j= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) + { + m[9][lidy-bheight] = ((int)py) * sy; // m03 + m[8][lidy-bheight] = ((int)x1.s0) * sy; // m12 + m[7][lidy-bheight] = ((int)x2.s0) * lidy; // m21 + m[6][lidy-bheight] = x3.s0; // m30 + m[5][lidy-bheight] = x0.s0 * sy; // m02 + m[4][lidy-bheight] = x1.s0 * lidy; // m11 + m[3][lidy-bheight] = x2.s0; // m20 + m[2][lidy-bheight] = py; // m01 + m[1][lidy-bheight] = x1.s0; // m10 + m[0][lidy-bheight] = x0.s0; // m00 + } + else if(lidy < bheight) + { + lm[9] = ((int)py) * sy; // m03 + lm[8] = ((int)x1.s0) * sy; // m12 + lm[7] = ((int)x2.s0) * lidy; // m21 + lm[6] = x3.s0; // m30 + lm[5] = x0.s0 * sy; // m02 + lm[4] = x1.s0 * lidy; // m11 + lm[3] = x2.s0; // m20 + lm[2] = py; // m01 + lm[1] = x1.s0; // m10 + lm[0] = x0.s0; // m00 + } + barrier(CLK_LOCAL_MEM_FENCE); + for( int j = bheight; j >= 1; j = j/2 ) + { + if(lidy < j) + for( int i = 0; i < 10; i++ ) + lm[i] = lm[i] + m[i][lidy]; + barrier(CLK_LOCAL_MEM_FENCE); + if(lidy >= j/2&&lidy < j) + for( int i = 0; i < 10; i++ ) + m[i][lidy-j/2] = lm[i]; + barrier(CLK_LOCAL_MEM_FENCE); + } + + if(lidy == 0&&lidx == 0) + { + for( int mt = 0; mt < 10; mt++ ) + mom[mt] = (F)lm[mt]; + if(binary) + { + F s = 1./255; + for( int mt = 0; mt < 10; mt++ ) + mom[mt] *= s; + } + F xm = x * mom[0], ym = y * mom[0]; + + // accumulate moments computed in each tile + dst_step /= sizeof(F); + + // + m00 ( = m00' ) + *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; + + // + m10 ( = m10' + x*m00' ) + *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; + + // + m01 ( = m01' + y*m00' ) + *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; + + // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) + *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); + + // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) + 
*(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; + + // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) + *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); + + // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) + *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); + + // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') + *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; + + // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') + *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; + + // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) + *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. 
* mom[2] + ym)); + } +} + +__kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step, + __global F* dst_m, + int dst_cols, int dst_step, int blocky, + int depth, int cn, int coi, int binary, const int TILE_SIZE) +{ + ushort tmp_coi[8]; // get the coi data + ushort8 tmp[32]; + int VLEN_US = 8; // vector length of ushort + int gidy = get_global_id(0); + int gidx = get_global_id(1); + int wgidy = get_group_id(0); + int wgidx = get_group_id(1); + int lidy = get_local_id(0); + int lidx = get_local_id(1); + int y = wgidy*TILE_SIZE; // real Y index of pixel + int x = wgidx*TILE_SIZE; // real X index of pixel + int kcn = (cn==2)?2:4; + int rstep = min(src_step/2, TILE_SIZE); + int tileSize_height = min(TILE_SIZE, src_rows - y); + int tileSize_width = min(TILE_SIZE, src_cols -x); + + if ( y+lidy < src_rows ) + { + if(src_cols > TILE_SIZE && tileSize_width < TILE_SIZE) + for(int i=tileSize_width; i < rstep && (x+i) < src_cols; i++ ) + *((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0; + if( coi > 0 ) + for(int i=0; i < tileSize_width; i+=VLEN_US) + { + for(int j=0; j= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) + { + m[9][lidy-bheight] = ((long)py) * sy; // m03 + m[8][lidy-bheight] = ((long)x1.s0) * sy; // m12 + m[7][lidy-bheight] = ((long)x2.s0) * lidy; // m21 + m[6][lidy-bheight] = x3.s0; // m30 + m[5][lidy-bheight] = x0.s0 * sy; // m02 + m[4][lidy-bheight] = x1.s0 * lidy; // m11 + m[3][lidy-bheight] = x2.s0; // m20 + m[2][lidy-bheight] = py; // m01 + m[1][lidy-bheight] = x1.s0; // m10 + m[0][lidy-bheight] = x0.s0; // m00 + } + else if(lidy < bheight) + { + lm[9] = ((long)py) * sy; // m03 + lm[8] = ((long)x1.s0) * sy; // m12 + lm[7] = ((long)x2.s0) * lidy; // m21 + lm[6] = x3.s0; // m30 + lm[5] = x0.s0 * sy; // m02 + lm[4] = x1.s0 * lidy; // m11 + lm[3] = x2.s0; // m20 + lm[2] = py; // m01 + lm[1] = x1.s0; // m10 + lm[0] = x0.s0; // m00 + } + barrier(CLK_LOCAL_MEM_FENCE); + + for( int j = TILE_SIZE/2; j >= 1; j = j/2 
) + { + if(lidy < j) + for( int i = 0; i < 10; i++ ) + lm[i] = lm[i] + m[i][lidy]; + } + barrier(CLK_LOCAL_MEM_FENCE); + for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) + { + if(lidy >= j/2&&lidy < j) + for( int i = 0; i < 10; i++ ) + m[i][lidy-j/2] = lm[i]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(lidy == 0&&lidx == 0) + { + for(int mt = 0; mt < 10; mt++ ) + mom[mt] = (F)lm[mt]; + + if(binary) + { + F s = 1./255; + for( int mt = 0; mt < 10; mt++ ) + mom[mt] *= s; + } + + F xm = x *mom[0], ym = y * mom[0]; + + // accumulate moments computed in each tile + dst_step /= sizeof(F); + + // + m00 ( = m00' ) + *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; + + // + m10 ( = m10' + x*m00' ) + *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; + + // + m01 ( = m01' + y*m00' ) + *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; + + // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) + *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); + + // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) + *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; + + // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) + *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); + + // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) + *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. 
* mom[1] + xm)); + + // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') + *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; + + // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') + *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; + + // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) + *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); + } +} + +__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step, + __global F* dst_m, + int dst_cols, int dst_step, int blocky, + int depth, int cn, int coi, int binary, const int TILE_SIZE) +{ + short tmp_coi[8]; // get the coi data + short8 tmp[32]; + int VLEN_S =8; // vector length of short + int gidy = get_global_id(0); + int gidx = get_global_id(1); + int wgidy = get_group_id(0); + int wgidx = get_group_id(1); + int lidy = get_local_id(0); + int lidx = get_local_id(1); + int y = wgidy*TILE_SIZE; // real Y index of pixel + int x = wgidx*TILE_SIZE; // real X index of pixel + int kcn = (cn==2)?2:4; + int rstep = min(src_step/2, TILE_SIZE); + int tileSize_height = min(TILE_SIZE, src_rows - y); + int tileSize_width = min(TILE_SIZE, src_cols -x); + + if ( y+lidy < src_rows ) + { + if(tileSize_width < TILE_SIZE) + for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ ) + *((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0; + if( coi > 0 ) + for(int i=0; i < tileSize_width; i+=VLEN_S) + { + for(int j=0; j= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) + { + m[9][lidy-bheight] = ((long)py) * sy; // m03 + m[8][lidy-bheight] = ((long)x1.s0) * sy; // m12 + m[7][lidy-bheight] = ((long)x2.s0) * lidy; // m21 + m[6][lidy-bheight] = x3.s0; // 
m30 + m[5][lidy-bheight] = x0.s0 * sy; // m02 + m[4][lidy-bheight] = x1.s0 * lidy; // m11 + m[3][lidy-bheight] = x2.s0; // m20 + m[2][lidy-bheight] = py; // m01 + m[1][lidy-bheight] = x1.s0; // m10 + m[0][lidy-bheight] = x0.s0; // m00 + } + else if(lidy < bheight) + { + lm[9] = ((long)py) * sy; // m03 + lm[8] = ((long)(x1.s0)) * sy; // m12 + lm[7] = ((long)(x2.s0)) * lidy; // m21 + lm[6] = x3.s0; // m30 + lm[5] = x0.s0 * sy; // m02 + lm[4] = x1.s0 * lidy; // m11 + lm[3] = x2.s0; // m20 + lm[2] = py; // m01 + lm[1] = x1.s0; // m10 + lm[0] = x0.s0; // m00 + } + barrier(CLK_LOCAL_MEM_FENCE); + for( int j = TILE_SIZE/2; j >=1; j = j/2 ) + { + if(lidy < j) + for( int i = 0; i < 10; i++ ) + lm[i] = lm[i] + m[i][lidy]; + barrier(CLK_LOCAL_MEM_FENCE); + if(lidy >= j/2&&lidy < j) + for( int i = 0; i < 10; i++ ) + m[i][lidy-j/2] = lm[i]; + barrier(CLK_LOCAL_MEM_FENCE); + } + if(lidy ==0 &&lidx ==0) + { + for(int mt = 0; mt < 10; mt++ ) + mom[mt] = (F)lm[mt]; + + if(binary) + { + F s = 1./255; + for( int mt = 0; mt < 10; mt++ ) + mom[mt] *= s; + } + + F xm = x * mom[0], ym = y*mom[0]; + + // accumulate moments computed in each tile + dst_step /= sizeof(F); + + // + m00 ( = m00' ) + *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; + + // + m10 ( = m10' + x*m00' ) + *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; + + // + m01 ( = m01' + y*m00' ) + *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; + + // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) + *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); + + // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) + *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; + + // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) + *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, 
wgidx))) = mom[5] + y * (mom[2] * 2 + ym); + + // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) + *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); + + // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') + *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; + + // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') + *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; + + // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) + *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); + } +} + +__kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step, + __global F* dst_m, + int dst_cols, int dst_step, int blocky, + int depth, int cn, int coi, int binary, const int TILE_SIZE) +{ + float tmp_coi[4]; // get the coi data + float4 tmp[64] ; + int VLEN_F = 4; // vector length of float + int gidy = get_global_id(0); + int gidx = get_global_id(1); + int wgidy = get_group_id(0); + int wgidx = get_group_id(1); + int lidy = get_local_id(0); + int lidx = get_local_id(1); + int y = wgidy*TILE_SIZE; // real Y index of pixel + int x = wgidx*TILE_SIZE; // real X index of pixel + int kcn = (cn==2)?2:4; + int rstep = min(src_step/4, TILE_SIZE); + int tileSize_height = min(TILE_SIZE, src_rows - y); + int tileSize_width = min(TILE_SIZE, src_cols -x); + int maxIdx = mul24(src_rows, src_cols); + int yOff = (y+lidy)*src_step; + int index; + + if ( y+lidy < src_rows ) + { + if(tileSize_width < TILE_SIZE) + for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ ) + *((__global float*)src_data+(y+lidy)*src_step/4+x+i) = 0; + if( coi > 0 ) 
+ for(int i=0; i < tileSize_width; i+=VLEN_F) + { + for(int j=0; j<4; j++) + tmp_coi[j] = *(src_data+(y+lidy)*src_step/4+(x+i+j)*kcn+coi-1); + tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]); + } + else + for(int i=0; i < tileSize_width; i+=VLEN_F) + tmp[i/VLEN_F] = (float4)(*(src_data+(y+lidy)*src_step/4+x+i),*(src_data+(y+lidy)*src_step/4+x+i+1),*(src_data+(y+lidy)*src_step/4+x+i+2),*(src_data+(y+lidy)*src_step/4+x+i+3)); + } + + float4 zero = (float4)(0); + float4 full = (float4)(255); + if( binary ) + for(int i=0; i < tileSize_width; i+=4) + tmp[i/VLEN_F] = (tmp[i/VLEN_F]!=zero)?full:zero; + F mom[10]; + __local F m[10][128]; + if(lidy < 128) + for(int i = 0; i < 10; i ++) + m[i][lidy] = 0; + barrier(CLK_LOCAL_MEM_FENCE); + F lm[10] = {0}; + F4 x0 = (F4)(0); + F4 x1 = (F4)(0); + F4 x2 = (F4)(0); + F4 x3 = (F4)(0); + for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_F ) + { + F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3); + F4 p = convert_F4(tmp[xt/VLEN_F]); + F4 xp = v_xt * p, xxp = xp * v_xt; + x0 += p; + x1 += xp; + x2 += xxp; + x3 += xxp * v_xt; + } + x0.s0 += x0.s1 + x0.s2 + x0.s3; + x1.s0 += x1.s1 + x1.s2 + x1.s3; + x2.s0 += x2.s1 + x2.s2 + x2.s3; + x3.s0 += x3.s1 + x3.s2 + x3.s3; + + F py = lidy * x0.s0, sy = lidy*lidy; + int bheight = min(tileSize_height, TILE_SIZE/2); + if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) + { + m[9][lidy-bheight] = ((F)py) * sy; // m03 + m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12 + m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21 + m[6][lidy-bheight] = x3.s0; // m30 + m[5][lidy-bheight] = x0.s0 * sy; // m02 + m[4][lidy-bheight] = x1.s0 * lidy; // m11 + m[3][lidy-bheight] = x2.s0; // m20 + m[2][lidy-bheight] = py; // m01 + m[1][lidy-bheight] = x1.s0; // m10 + m[0][lidy-bheight] = x0.s0; // m00 + } + + else if(lidy < bheight) + { + lm[9] = ((F)py) * sy; // m03 + lm[8] = ((F)x1.s0) * sy; // m12 + lm[7] = ((F)x2.s0) * lidy; // m21 + lm[6] = x3.s0; // m30 + lm[5] = x0.s0 * sy; // m02 + lm[4] = 
x1.s0 * lidy; // m11 + lm[3] = x2.s0; // m20 + lm[2] = py; // m01 + lm[1] = x1.s0; // m10 + lm[0] = x0.s0; // m00 + } + barrier(CLK_LOCAL_MEM_FENCE); + for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) + { + if(lidy < j) + for( int i = 0; i < 10; i++ ) + lm[i] = lm[i] + m[i][lidy]; + barrier(CLK_LOCAL_MEM_FENCE); + if(lidy >= j/2&&lidy < j) + for( int i = 0; i < 10; i++ ) + m[i][lidy-j/2] = lm[i]; + barrier(CLK_LOCAL_MEM_FENCE); + } + if(lidy == 0&&lidx == 0) + { + for( int mt = 0; mt < 10; mt++ ) + mom[mt] = (F)lm[mt]; + if(binary) + { + F s = 1./255; + for( int mt = 0; mt < 10; mt++ ) + mom[mt] *= s; + } + + F xm = x * mom[0], ym = y * mom[0]; + + // accumulate moments computed in each tile + dst_step /= sizeof(F); + + // + m00 ( = m00' ) + *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; + + // + m10 ( = m10' + x*m00' ) + *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; + + // + m01 ( = m01' + y*m00' ) + *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; + + // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) + *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); + + // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) + *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; + + // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) + *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); + + // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) + *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. 
* mom[1] + xm)); + + // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') + *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; + + // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') + *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; + + // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) + *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); + } +} + +__kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, int src_step, + __global F* dst_m, + int dst_cols, int dst_step, int blocky, + int depth, int cn, int coi, int binary, const int TILE_SIZE) +{ + F tmp_coi[4]; // get the coi data + F4 tmp[64]; + int VLEN_D = 4; // length of vetor + int gidy = get_global_id(0); + int gidx = get_global_id(1); + int wgidy = get_group_id(0); + int wgidx = get_group_id(1); + int lidy = get_local_id(0); + int lidx = get_local_id(1); + int y = wgidy*TILE_SIZE; // real Y index of pixel + int x = wgidx*TILE_SIZE; // real X index of pixel + int kcn = (cn==2)?2:4; + int rstep = min(src_step/8, TILE_SIZE); + int tileSize_height = min(TILE_SIZE, src_rows - y); + int tileSize_width = min(TILE_SIZE, src_cols - x); + + if ( y+lidy < src_rows ) + { + if(tileSize_width < TILE_SIZE) + for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ ) + *((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0; + if( coi > 0 ) + for(int i=0; i < tileSize_width; i+=VLEN_D) + { + for(int j=0; j<4 && ((x+i+j)*kcn+coi-1)= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) + { + m[9][lidy-bheight] = ((F)py) * sy; // m03 + m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12 + m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21 + m[6][lidy-bheight] = x3.s0; // m30 + 
m[5][lidy-bheight] = x0.s0 * sy; // m02 + m[4][lidy-bheight] = x1.s0 * lidy; // m11 + m[3][lidy-bheight] = x2.s0; // m20 + m[2][lidy-bheight] = py; // m01 + m[1][lidy-bheight] = x1.s0; // m10 + m[0][lidy-bheight] = x0.s0; // m00 + } + else if(lidy < bheight) + { + lm[9] = ((F)py) * sy; // m03 + lm[8] = ((F)x1.s0) * sy; // m12 + lm[7] = ((F)x2.s0) * lidy; // m21 + lm[6] = x3.s0; // m30 + lm[5] = x0.s0 * sy; // m02 + lm[4] = x1.s0 * lidy; // m11 + lm[3] = x2.s0; // m20 + lm[2] = py; // m01 + lm[1] = x1.s0; // m10 + lm[0] = x0.s0; // m00 + } + barrier(CLK_LOCAL_MEM_FENCE); + + for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) + { + if(lidy < j) + for( int i = 0; i < 10; i++ ) + lm[i] = lm[i] + m[i][lidy]; + barrier(CLK_LOCAL_MEM_FENCE); + if(lidy >= j/2&&lidy < j) + for( int i = 0; i < 10; i++ ) + m[i][lidy-j/2] = lm[i]; + barrier(CLK_LOCAL_MEM_FENCE); + } + if(lidy == 0&&lidx == 0) + { + for( int mt = 0; mt < 10; mt++ ) + mom[mt] = (F)lm[mt]; + if(binary) + { + F s = 1./255; + for( int mt = 0; mt < 10; mt++ ) + mom[mt] *= s; + } + + F xm = x * mom[0], ym = y * mom[0]; + + // accumulate moments computed in each tile + dst_step /= sizeof(F); + + // + m00 ( = m00' ) + *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; + + // + m10 ( = m10' + x*m00' ) + *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; + + // + m01 ( = m01' + y*m00' ) + *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; + + // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) + *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); + + // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) + *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; + + // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) + *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + 
y * (mom[2] * 2 + ym); + + // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) + *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); + + // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') + *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; + + // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') + *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; + + // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) + *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); + } +} diff --git a/modules/imgproc/src/opencl/morph.cl b/modules/imgproc/src/opencl/morph.cl new file mode 100644 index 0000000000..c402ff7210 --- /dev/null +++ b/modules/imgproc/src/opencl/morph.cl @@ -0,0 +1,228 @@ +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Niko Li, newlife20080214@gmail.com +// Zero Lin, zero.lin@amd.com +// Yao Wang, bitwangyaoyao@gmail.com +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +// + + +#ifdef ERODE +#define MORPH_OP(A,B) min((A),(B)) +#endif +#ifdef DILATE +#define MORPH_OP(A,B) max((A),(B)) +#endif +//BORDER_CONSTANT: iiiiii|abcdefgh|iiiiiii +#define ELEM(i,l_edge,r_edge,elem1,elem2) (i)<(l_edge) | (i) >= (r_edge) ? 
(elem1) : (elem2) +#ifndef GENTYPE + +__kernel void morph_C1_D0(__global const uchar * restrict src, + __global uchar *dst, + int src_offset_x, int src_offset_y, + int cols, int rows, + int src_step_in_pixel, int dst_step_in_pixel, + __constant uchar * mat_kernel, + int src_whole_cols, int src_whole_rows, + int dst_offset_in_pixel) +{ + int l_x = get_local_id(0); + int l_y = get_local_id(1); + int x = get_group_id(0)*4*LSIZE0; + int y = get_group_id(1)*LSIZE1; + int start_x = x+src_offset_x-RADIUSX & 0xfffffffc; + int end_x = x + src_offset_x+LSIZE0*4+RADIUSX & 0xfffffffc; + int width = (end_x -start_x+4)>>2; + int offset = src_offset_x-RADIUSX & 3; + int start_y = y+src_offset_y-RADIUSY; + int point1 = mad24(l_y,LSIZE0,l_x); + int point2 = point1 + LSIZE0*LSIZE1; + int tl_x = (point1 % width)<<2; + int tl_y = point1 / width; + int tl_x2 = (point2 % width)<<2; + int tl_y2 = point2 / width; + int cur_x = start_x + tl_x; + int cur_y = start_y + tl_y; + int cur_x2 = start_x + tl_x2; + int cur_y2 = start_y + tl_y2; + int start_addr = mad24(cur_y,src_step_in_pixel,cur_x); + int start_addr2 = mad24(cur_y2,src_step_in_pixel,cur_x2); + uchar4 temp0,temp1; + __local uchar4 LDS_DAT[2*LSIZE1*LSIZE0]; + + int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols); + //read pixels from src + start_addr = ((start_addr < end_addr) && (start_addr > 0)) ? start_addr : 0; + start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? 
start_addr2 : 0; + temp0 = *(__global uchar4*)&src[start_addr]; + temp1 = *(__global uchar4*)&src[start_addr2]; + //judge if read out of boundary + temp0.x= ELEM(cur_x,0,src_whole_cols,VAL,temp0.x); + temp0.y= ELEM(cur_x+1,0,src_whole_cols,VAL,temp0.y); + temp0.z= ELEM(cur_x+2,0,src_whole_cols,VAL,temp0.z); + temp0.w= ELEM(cur_x+3,0,src_whole_cols,VAL,temp0.w); + temp0= ELEM(cur_y,0,src_whole_rows,(uchar4)VAL,temp0); + + temp1.x= ELEM(cur_x2,0,src_whole_cols,VAL,temp1.x); + temp1.y= ELEM(cur_x2+1,0,src_whole_cols,VAL,temp1.y); + temp1.z= ELEM(cur_x2+2,0,src_whole_cols,VAL,temp1.z); + temp1.w= ELEM(cur_x2+3,0,src_whole_cols,VAL,temp1.w); + temp1= ELEM(cur_y2,0,src_whole_rows,(uchar4)VAL,temp1); + + LDS_DAT[point1] = temp0; + LDS_DAT[point2] = temp1; + barrier(CLK_LOCAL_MEM_FENCE); + uchar4 res = (uchar4)VAL; + + for(int i=0; i<2*RADIUSY+1; i++) + for(int j=0; j<2*RADIUSX+1; j++) + { + res = +#ifndef RECTKERNEL + mat_kernel[i*(2*RADIUSX+1)+j] ? +#endif + MORPH_OP(res,vload4(0,(__local uchar*)&LDS_DAT[mad24((l_y+i),width,l_x)]+offset+j)) +#ifndef RECTKERNEL + :res +#endif + ; + } + + int gidx = get_global_id(0)<<2; + int gidy = get_global_id(1); + int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel); + + if(gidx+3 0)) ? start_addr : 0; + start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? start_addr2 : 0; + temp0 = src[start_addr]; + temp1 = src[start_addr2]; + //judge if read out of boundary + temp0= ELEM(cur_x,0,src_whole_cols,(GENTYPE)VAL,temp0); + temp0= ELEM(cur_y,0,src_whole_rows,(GENTYPE)VAL,temp0); + + temp1= ELEM(cur_x2,0,src_whole_cols,(GENTYPE)VAL,temp1); + temp1= ELEM(cur_y2,0,src_whole_rows,(GENTYPE)VAL,temp1); + + LDS_DAT[point1] = temp0; + LDS_DAT[point2] = temp1; + barrier(CLK_LOCAL_MEM_FENCE); + GENTYPE res = (GENTYPE)VAL; + for(int i=0; i<2*RADIUSY+1; i++) + for(int j=0; j<2*RADIUSX+1; j++) + { + res = +#ifndef RECTKERNEL + mat_kernel[i*(2*RADIUSX+1)+j] ? 
+#endif + MORPH_OP(res,LDS_DAT[mad24(l_y+i,width,l_x+j)]) +#ifndef RECTKERNEL + :res +#endif + ; + } + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel); + if(gidx= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2) + { + sum = 0.0625f * (((srcData + (src_y - 2) * srcStep))[x]); + sum = sum + 0.25f * (((srcData + (src_y - 1) * srcStep))[x]); + sum = sum + 0.375f * (((srcData + (src_y ) * srcStep))[x]); + sum = sum + 0.25f * (((srcData + (src_y + 1) * srcStep))[x]); + sum = sum + 0.0625f * (((srcData + (src_y + 2) * srcStep))[x]); + + smem[2 + get_local_id(0)] = sum; + + if (get_local_id(0) < 2) + { + const int left_x = x - 2; + + sum = 0.0625f * (((srcData + (src_y - 2) * srcStep))[left_x]); + sum = sum + 0.25f * (((srcData + (src_y - 1) * srcStep))[left_x]); + sum = sum + 0.375f * (((srcData + (src_y ) * srcStep))[left_x]); + sum = sum + 0.25f * (((srcData + (src_y + 1) * srcStep))[left_x]); + sum = sum + 0.0625f * (((srcData + (src_y + 2) * srcStep))[left_x]); + + smem[get_local_id(0)] = sum; + } + + if (get_local_id(0) > 253) + { + const int right_x = x + 2; + + sum = 0.0625f * (((srcData + (src_y - 2) * srcStep))[right_x]); + sum = sum + 0.25f * (((srcData + (src_y - 1) * srcStep))[right_x]); + sum = sum + 0.375f * (((srcData + (src_y ) * srcStep))[right_x]); + sum = sum + 0.25f * (((srcData + (src_y + 1) * srcStep))[right_x]); + sum = sum + 0.0625f * (((srcData + (src_y + 2) * srcStep))[right_x]); + + smem[4 + get_local_id(0)] = sum; + } + } + else + { + int col = idx_col(x, last_col); + + sum = 0.0625f * (((srcData + idx_row(src_y - 2, last_row) * srcStep))[col]); + sum = sum + 0.25f * (((srcData + idx_row(src_y - 1, last_row) * srcStep))[col]); + sum = sum + 0.375f * (((srcData + idx_row(src_y , last_row) * srcStep))[col]); + sum = sum + 0.25f * (((srcData + idx_row(src_y + 1, last_row) * srcStep))[col]); + sum = sum + 0.0625f * (((srcData + idx_row(src_y + 2, last_row) * 
srcStep))[col]); + + smem[2 + get_local_id(0)] = sum; + + if (get_local_id(0) < 2) + { + const int left_x = x - 2; + + col = idx_col(left_x, last_col); + + sum = 0.0625f * (((srcData + idx_row(src_y - 2, last_row) * srcStep))[col]); + sum = sum + 0.25f * (((srcData + idx_row(src_y - 1, last_row) * srcStep))[col]); + sum = sum + 0.375f * (((srcData + idx_row(src_y , last_row) * srcStep))[col]); + sum = sum + 0.25f * (((srcData + idx_row(src_y + 1, last_row) * srcStep))[col]); + sum = sum + 0.0625f * (((srcData + idx_row(src_y + 2, last_row) * srcStep))[col]); + + smem[get_local_id(0)] = sum; + } + + if (get_local_id(0) > 253) + { + const int right_x = x + 2; + + col = idx_col(right_x, last_col); + + sum = 0.0625f * (((srcData + idx_row(src_y - 2, last_row) * srcStep))[col]); + sum = sum + 0.25f * (((srcData + idx_row(src_y - 1, last_row) * srcStep))[col]); + sum = sum + 0.375f * (((srcData + idx_row(src_y , last_row) * srcStep))[col]); + sum = sum + 0.25f * (((srcData + idx_row(src_y + 1, last_row) * srcStep))[col]); + sum = sum + 0.0625f * (((srcData + idx_row(src_y + 2, last_row) * srcStep))[col]); + + smem[4 + get_local_id(0)] = sum; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (get_local_id(0) < 128) + { + const int tid2 = get_local_id(0) * 2; + + sum = 0.0625f * smem[2 + tid2 - 2]; + sum = sum + 0.25f * smem[2 + tid2 - 1]; + sum = sum + 0.375f * smem[2 + tid2 ]; + sum = sum + 0.25f * smem[2 + tid2 + 1]; + sum = sum + 0.0625f * smem[2 + tid2 + 2]; + + const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; + + if (dst_x < dstCols) + dst[y * dstStep + dst_x] = convert_uchar_sat_rte(sum); + } +} + +/////////////////////////////////////////////////////////////////////// +////////////////////////// CV_8UC4 /////////////////////////////////// +/////////////////////////////////////////////////////////////////////// + +__kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcRows, int srcCols, __global uchar4 *dst, int dstStep, int 
dstCols) +{ + const int x = get_global_id(0); + const int y = get_group_id(1); + + __local float4 smem[256 + 4]; + + float4 sum; + + const int src_y = 2*y; + const int last_row = srcRows - 1; + const int last_col = srcCols - 1; + + float4 co1 = 0.375f; + float4 co2 = 0.25f; + float4 co3 = 0.0625f; + + if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2) + { + sum = co3 * convert_float4((((srcData + (src_y - 2) * srcStep / 4))[x])); + sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[x])); + sum = sum + co1 * convert_float4((((srcData + (src_y ) * srcStep / 4))[x])); + sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[x])); + sum = sum + co3 * convert_float4((((srcData + (src_y + 2) * srcStep / 4))[x])); + + smem[2 + get_local_id(0)] = sum; + + if (get_local_id(0) < 2) + { + const int left_x = x - 2; + + sum = co3 * convert_float4((((srcData + (src_y - 2) * srcStep / 4))[left_x])); + sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[left_x])); + sum = sum + co1 * convert_float4((((srcData + (src_y ) * srcStep / 4))[left_x])); + sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[left_x])); + sum = sum + co3 * convert_float4((((srcData + (src_y + 2) * srcStep / 4))[left_x])); + + smem[get_local_id(0)] = sum; + } + + if (get_local_id(0) > 253) + { + const int right_x = x + 2; + + sum = co3 * convert_float4((((srcData + (src_y - 2) * srcStep / 4))[right_x])); + sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[right_x])); + sum = sum + co1 * convert_float4((((srcData + (src_y ) * srcStep / 4))[right_x])); + sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[right_x])); + sum = sum + co3 * convert_float4((((srcData + (src_y + 2) * srcStep / 4))[right_x])); + + smem[4 + get_local_id(0)] = sum; + } + } + else + { + int col = idx_col(x, last_col); + + sum = co3 * convert_float4((((srcData + idx_row(src_y - 2, last_row) 
* srcStep / 4))[col])); + sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col])); + sum = sum + co1 * convert_float4((((srcData + idx_row(src_y , last_row) * srcStep / 4))[col])); + sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col])); + sum = sum + co3 * convert_float4((((srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col])); + + smem[2 + get_local_id(0)] = sum; + + if (get_local_id(0) < 2) + { + const int left_x = x - 2; + + col = idx_col(left_x, last_col); + + sum = co3 * convert_float4((((srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col])); + sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col])); + sum = sum + co1 * convert_float4((((srcData + idx_row(src_y , last_row) * srcStep / 4))[col])); + sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col])); + sum = sum + co3 * convert_float4((((srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col])); + + smem[get_local_id(0)] = sum; + } + + if (get_local_id(0) > 253) + { + const int right_x = x + 2; + + col = idx_col(right_x, last_col); + + sum = co3 * convert_float4((((srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col])); + sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col])); + sum = sum + co1 * convert_float4((((srcData + idx_row(src_y , last_row) * srcStep / 4))[col])); + sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col])); + sum = sum + co3 * convert_float4((((srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col])); + + smem[4 + get_local_id(0)] = sum; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (get_local_id(0) < 128) + { + const int tid2 = get_local_id(0) * 2; + + sum = co3 * smem[2 + tid2 - 2]; + sum = sum + co2 * smem[2 + tid2 - 1]; + sum = sum + co1 * smem[2 + tid2 ]; + sum = sum + co2 * smem[2 + tid2 + 
1]; + sum = sum + co3 * smem[2 + tid2 + 2]; + + const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; + + if (dst_x < dstCols) + dst[y * dstStep / 4 + dst_x] = convert_uchar4_sat_rte(sum); + } +} + +/////////////////////////////////////////////////////////////////////// +////////////////////////// CV_16UC1 ////////////////////////////////// +/////////////////////////////////////////////////////////////////////// + +__kernel void pyrDown_C1_D2(__global ushort * srcData, int srcStep, int srcRows, int srcCols, __global ushort *dst, int dstStep, int dstCols) +{ + const int x = get_global_id(0); + const int y = get_group_id(1); + + __local float smem[256 + 4]; + + float sum; + + const int src_y = 2*y; + const int last_row = srcRows - 1; + const int last_col = srcCols - 1; + + if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2) + { + sum = 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y - 2) * srcStep))[x]; + sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y - 1) * srcStep))[x]; + sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + (src_y ) * srcStep))[x]; + sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y + 1) * srcStep))[x]; + sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y + 2) * srcStep))[x]; + + smem[2 + get_local_id(0)] = sum; + + if (get_local_id(0) < 2) + { + const int left_x = x - 2; + + sum = 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y - 2) * srcStep))[left_x]; + sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y - 1) * srcStep))[left_x]; + sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + (src_y ) * srcStep))[left_x]; + sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y + 1) * srcStep))[left_x]; + sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y + 2) * srcStep))[left_x]; + + smem[get_local_id(0)] = sum; + } + + if 
(get_local_id(0) > 253) + { + const int right_x = x + 2; + + sum = 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y - 2) * srcStep))[right_x]; + sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y - 1) * srcStep))[right_x]; + sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + (src_y ) * srcStep))[right_x]; + sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y + 1) * srcStep))[right_x]; + sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y + 2) * srcStep))[right_x]; + + smem[4 + get_local_id(0)] = sum; + } + } + else + { + int col = idx_col(x, last_col); + + sum = 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col]; + sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col]; + sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col]; + sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col]; + sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col]; + + smem[2 + get_local_id(0)] = sum; + + if (get_local_id(0) < 2) + { + const int left_x = x - 2; + + col = idx_col(left_x, last_col); + + sum = 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col]; + sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col]; + sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col]; + sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col]; + sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col]; + + smem[get_local_id(0)] = sum; + } + + if (get_local_id(0) > 
253) + { + const int right_x = x + 2; + + col = idx_col(right_x, last_col); + + sum = 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col]; + sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col]; + sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col]; + sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col]; + sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col]; + + smem[4 + get_local_id(0)] = sum; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (get_local_id(0) < 128) + { + const int tid2 = get_local_id(0) * 2; + + sum = 0.0625f * smem[2 + tid2 - 2]; + sum = sum + 0.25f * smem[2 + tid2 - 1]; + sum = sum + 0.375f * smem[2 + tid2 ]; + sum = sum + 0.25f * smem[2 + tid2 + 1]; + sum = sum + 0.0625f * smem[2 + tid2 + 2]; + + const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; + + if (dst_x < dstCols) + dst[y * dstStep / 2 + dst_x] = convert_ushort_sat_rte(sum); + } +} + +/////////////////////////////////////////////////////////////////////// +////////////////////////// CV_16UC4 ////////////////////////////////// +/////////////////////////////////////////////////////////////////////// + +__kernel void pyrDown_C4_D2(__global ushort4 * srcData, int srcStep, int srcRows, int srcCols, __global ushort4 *dst, int dstStep, int dstCols) +{ + const int x = get_global_id(0); + const int y = get_group_id(1); + + __local float4 smem[256 + 4]; + + float4 sum; + + const int src_y = 2*y; + const int last_row = srcRows - 1; + const int last_col = srcCols - 1; + + float4 co1 = 0.375f; + float4 co2 = 0.25f; + float4 co3 = 0.0625f; + + if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2) + { + sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 2) 
* srcStep / 4))[x]); + sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[x]); + sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[x]); + sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[x]); + sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[x]); + + smem[2 + get_local_id(0)] = sum; + + if (get_local_id(0) < 2) + { + const int left_x = x - 2; + + sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[left_x]); + sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[left_x]); + sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[left_x]); + sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[left_x]); + sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[left_x]); + + smem[get_local_id(0)] = sum; + } + + if (get_local_id(0) > 253) + { + const int right_x = x + 2; + + sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[right_x]); + sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[right_x]); + sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[right_x]); + sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[right_x]); + sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[right_x]); + + smem[4 + get_local_id(0)] = sum; + } + } + else + { + int col = idx_col(x, last_col); + + sum = co3 * 
convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]); + sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]); + sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]); + sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]); + sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]); + + smem[2 + get_local_id(0)] = sum; + + if (get_local_id(0) < 2) + { + const int left_x = x - 2; + + col = idx_col(left_x, last_col); + + sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]); + sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]); + sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]); + sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]); + sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]); + + smem[get_local_id(0)] = sum; + } + + if (get_local_id(0) > 253) + { + const int right_x = x + 2; + + col = idx_col(right_x, last_col); + + sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]); + sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]); + sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]); + sum = 
sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]); + sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]); + + smem[4 + get_local_id(0)] = sum; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (get_local_id(0) < 128) + { + const int tid2 = get_local_id(0) * 2; + + sum = co3 * smem[2 + tid2 - 2]; + sum = sum + co2 * smem[2 + tid2 - 1]; + sum = sum + co1 * smem[2 + tid2 ]; + sum = sum + co2 * smem[2 + tid2 + 1]; + sum = sum + co3 * smem[2 + tid2 + 2]; + + const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; + + if (dst_x < dstCols) + dst[y * dstStep / 8 + dst_x] = convert_ushort4_sat_rte(sum); + } +} + +/////////////////////////////////////////////////////////////////////// +////////////////////////// CV_16SC1 ////////////////////////////////// +/////////////////////////////////////////////////////////////////////// + +__kernel void pyrDown_C1_D3(__global short * srcData, int srcStep, int srcRows, int srcCols, __global short *dst, int dstStep, int dstCols) +{ + const int x = get_global_id(0); + const int y = get_group_id(1); + + __local float smem[256 + 4]; + + float sum; + + const int src_y = 2*y; + const int last_row = srcRows - 1; + const int last_col = srcCols - 1; + + if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2) + { + sum = 0.0625f * ((__global short*)((__global char*)srcData + (src_y - 2) * srcStep))[x]; + sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y - 1) * srcStep))[x]; + sum = sum + 0.375f * ((__global short*)((__global char*)srcData + (src_y ) * srcStep))[x]; + sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y + 1) * srcStep))[x]; + sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + (src_y + 2) * srcStep))[x]; + + smem[2 + get_local_id(0)] = sum; + + if (get_local_id(0) < 2) + { + const int left_x = x - 
2; + + sum = 0.0625f * ((__global short*)((__global char*)srcData + (src_y - 2) * srcStep))[left_x]; + sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y - 1) * srcStep))[left_x]; + sum = sum + 0.375f * ((__global short*)((__global char*)srcData + (src_y ) * srcStep))[left_x]; + sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y + 1) * srcStep))[left_x]; + sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + (src_y + 2) * srcStep))[left_x]; + + smem[get_local_id(0)] = sum; + } + + if (get_local_id(0) > 253) + { + const int right_x = x + 2; + + sum = 0.0625f * ((__global short*)((__global char*)srcData + (src_y - 2) * srcStep))[right_x]; + sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y - 1) * srcStep))[right_x]; + sum = sum + 0.375f * ((__global short*)((__global char*)srcData + (src_y ) * srcStep))[right_x]; + sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y + 1) * srcStep))[right_x]; + sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + (src_y + 2) * srcStep))[right_x]; + + smem[4 + get_local_id(0)] = sum; + } + } + else + { + int col = idx_col(x, last_col); + + sum = 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col]; + sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col]; + sum = sum + 0.375f * ((__global short*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col]; + sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col]; + sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col]; + + smem[2 + get_local_id(0)] = sum; + + if (get_local_id(0) < 2) + { + const int left_x = x - 2; + + col = idx_col(left_x, last_col); + + sum = 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y - 2, last_row) * 
srcStep))[col]; + sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col]; + sum = sum + 0.375f * ((__global short*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col]; + sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col]; + sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col]; + + smem[get_local_id(0)] = sum; + } + + if (get_local_id(0) > 253) + { + const int right_x = x + 2; + + col = idx_col(right_x, last_col); + + sum = 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col]; + sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col]; + sum = sum + 0.375f * ((__global short*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col]; + sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col]; + sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col]; + + smem[4 + get_local_id(0)] = sum; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (get_local_id(0) < 128) + { + const int tid2 = get_local_id(0) * 2; + + sum = 0.0625f * smem[2 + tid2 - 2]; + sum = sum + 0.25f * smem[2 + tid2 - 1]; + sum = sum + 0.375f * smem[2 + tid2 ]; + sum = sum + 0.25f * smem[2 + tid2 + 1]; + sum = sum + 0.0625f * smem[2 + tid2 + 2]; + + const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; + + if (dst_x < dstCols) + dst[y * dstStep / 2 + dst_x] = convert_short_sat_rte(sum); + } +} + +/////////////////////////////////////////////////////////////////////// +////////////////////////// CV_16SC4 ////////////////////////////////// +/////////////////////////////////////////////////////////////////////// + +__kernel void pyrDown_C4_D3(__global short4 * srcData, int srcStep, 
int srcRows, int srcCols, __global short4 *dst, int dstStep, int dstCols) +{ + const int x = get_global_id(0); + const int y = get_group_id(1); + + __local float4 smem[256 + 4]; + + float4 sum; + + const int src_y = 2*y; + const int last_row = srcRows - 1; + const int last_col = srcCols - 1; + + float4 co1 = 0.375f; + float4 co2 = 0.25f; + float4 co3 = 0.0625f; + + if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2) + { + sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[x]); + sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[x]); + sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[x]); + sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[x]); + sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[x]); + + smem[2 + get_local_id(0)] = sum; + + if (get_local_id(0) < 2) + { + const int left_x = x - 2; + + sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[left_x]); + sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[left_x]); + sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[left_x]); + sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[left_x]); + sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[left_x]); + + smem[get_local_id(0)] = sum; + } + + if (get_local_id(0) > 253) + { + const int right_x = x + 2; + + sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[right_x]); + sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + 
(src_y - 1) * srcStep / 4))[right_x]); + sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[right_x]); + sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[right_x]); + sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[right_x]); + + smem[4 + get_local_id(0)] = sum; + } + } + else + { + int col = idx_col(x, last_col); + + sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]); + sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]); + sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]); + sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]); + sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]); + + smem[2 + get_local_id(0)] = sum; + + if (get_local_id(0) < 2) + { + const int left_x = x - 2; + + col = idx_col(left_x, last_col); + + sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]); + sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]); + sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]); + sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]); + sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]); + + smem[get_local_id(0)] = sum; + } + + if 
(get_local_id(0) > 253) + { + const int right_x = x + 2; + + col = idx_col(right_x, last_col); + + sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]); + sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]); + sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]); + sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]); + sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]); + + smem[4 + get_local_id(0)] = sum; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (get_local_id(0) < 128) + { + const int tid2 = get_local_id(0) * 2; + + sum = co3 * smem[2 + tid2 - 2]; + sum = sum + co2 * smem[2 + tid2 - 1]; + sum = sum + co1 * smem[2 + tid2 ]; + sum = sum + co2 * smem[2 + tid2 + 1]; + sum = sum + co3 * smem[2 + tid2 + 2]; + + const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; + + if (dst_x < dstCols) + dst[y * dstStep / 8 + dst_x] = convert_short4_sat_rte(sum); + } +} + +/////////////////////////////////////////////////////////////////////// +////////////////////////// CV_32FC1 ////////////////////////////////// +/////////////////////////////////////////////////////////////////////// + +__kernel void pyrDown_C1_D5(__global float * srcData, int srcStep, int srcRows, int srcCols, __global float *dst, int dstStep, int dstCols) +{ + const int x = get_global_id(0); + const int y = get_group_id(1); + + __local float smem[256 + 4]; + + float sum; + + const int src_y = 2*y; + const int last_row = srcRows - 1; + const int last_col = srcCols - 1; + + if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2) + { + sum = 0.0625f * ((__global float*)((__global char*)srcData + (src_y - 
2) * srcStep))[x]; + sum = sum + 0.25f * ((__global float*)((__global char*)srcData + (src_y - 1) * srcStep))[x]; + sum = sum + 0.375f * ((__global float*)((__global char*)srcData + (src_y ) * srcStep))[x]; + sum = sum + 0.25f * ((__global float*)((__global char*)srcData + (src_y + 1) * srcStep))[x]; + sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + (src_y + 2) * srcStep))[x]; + + smem[2 + get_local_id(0)] = sum; + + if (get_local_id(0) < 2) + { + const int left_x = x - 2; + + sum = 0.0625f * ((__global float*)((__global char*)srcData + (src_y - 2) * srcStep))[left_x]; + sum = sum + 0.25f * ((__global float*)((__global char*)srcData + (src_y - 1) * srcStep))[left_x]; + sum = sum + 0.375f * ((__global float*)((__global char*)srcData + (src_y ) * srcStep))[left_x]; + sum = sum + 0.25f * ((__global float*)((__global char*)srcData + (src_y + 1) * srcStep))[left_x]; + sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + (src_y + 2) * srcStep))[left_x]; + + smem[get_local_id(0)] = sum; + } + + if (get_local_id(0) > 253) + { + const int right_x = x + 2; + + sum = 0.0625f * ((__global float*)((__global char*)srcData + (src_y - 2) * srcStep))[right_x]; + sum = sum + 0.25f * ((__global float*)((__global char*)srcData + (src_y - 1) * srcStep))[right_x]; + sum = sum + 0.375f * ((__global float*)((__global char*)srcData + (src_y ) * srcStep))[right_x]; + sum = sum + 0.25f * ((__global float*)((__global char*)srcData + (src_y + 1) * srcStep))[right_x]; + sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + (src_y + 2) * srcStep))[right_x]; + + smem[4 + get_local_id(0)] = sum; + } + } + else + { + int col = idx_col(x, last_col); + + sum = 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col]; + sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col]; + sum = sum + 0.375f * ((__global float*)((__global char*)srcData + idx_row(src_y , 
last_row) * srcStep))[col]; + sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col]; + sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col]; + + smem[2 + get_local_id(0)] = sum; + + if (get_local_id(0) < 2) + { + const int left_x = x - 2; + + col = idx_col(left_x, last_col); + + sum = 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col]; + sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col]; + sum = sum + 0.375f * ((__global float*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col]; + sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col]; + sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col]; + + smem[get_local_id(0)] = sum; + } + + if (get_local_id(0) > 253) + { + const int right_x = x + 2; + + col = idx_col(right_x, last_col); + + sum = 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col]; + sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col]; + sum = sum + 0.375f * ((__global float*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col]; + sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col]; + sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col]; + + smem[4 + get_local_id(0)] = sum; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (get_local_id(0) < 128) + { + const int tid2 = get_local_id(0) * 2; + + sum = 0.0625f * smem[2 + tid2 - 2]; + sum = sum + 0.25f * smem[2 + tid2 - 1]; + sum = sum + 0.375f * smem[2 + tid2 ]; + sum = sum + 0.25f * smem[2 + tid2 + 1]; + sum = sum + 
0.0625f * smem[2 + tid2 + 2]; + + const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; + + if (dst_x < dstCols) + dst[y * dstStep / 4 + dst_x] = sum; + } +} + +/////////////////////////////////////////////////////////////////////// +////////////////////////// CV_32FC4 ////////////////////////////////// +/////////////////////////////////////////////////////////////////////// + +__kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcRows, int srcCols, __global float4 *dst, int dstStep, int dstCols) +{ + const int x = get_global_id(0); + const int y = get_group_id(1); + + __local float4 smem[256 + 4]; + + float4 sum; + + const int src_y = 2*y; + const int last_row = srcRows - 1; + const int last_col = srcCols - 1; + + float4 co1 = 0.375f; + float4 co2 = 0.25f; + float4 co3 = 0.0625f; + + if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2) + { + sum = co3 * ((__global float4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[x]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[x]; + sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[x]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[x]; + sum = sum + co3 * ((__global float4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[x]; + + smem[2 + get_local_id(0)] = sum; + + if (get_local_id(0) < 2) + { + const int left_x = x - 2; + + sum = co3 * ((__global float4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[left_x]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[left_x]; + sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[left_x]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[left_x]; + sum = sum + co3 * ((__global float4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[left_x]; 
+ + smem[get_local_id(0)] = sum; + } + + if (get_local_id(0) > 253) + { + const int right_x = x + 2; + + sum = co3 * ((__global float4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[right_x]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[right_x]; + sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[right_x]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[right_x]; + sum = sum + co3 * ((__global float4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[right_x]; + + smem[4 + get_local_id(0)] = sum; + } + } + else + { + int col = idx_col(x, last_col); + + sum = co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]; + sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]; + sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]; + + smem[2 + get_local_id(0)] = sum; + + if (get_local_id(0) < 2) + { + const int left_x = x - 2; + + col = idx_col(left_x, last_col); + + sum = co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]; + sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]; + sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 
4))[col]; + + smem[get_local_id(0)] = sum; + } + + if (get_local_id(0) > 253) + { + const int right_x = x + 2; + + col = idx_col(right_x, last_col); + + sum = co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]; + sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]; + sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]; + + smem[4 + get_local_id(0)] = sum; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (get_local_id(0) < 128) + { + const int tid2 = get_local_id(0) * 2; + + sum = co3 * smem[2 + tid2 - 2]; + sum = sum + co2 * smem[2 + tid2 - 1]; + sum = sum + co1 * smem[2 + tid2 ]; + sum = sum + co2 * smem[2 + tid2 + 1]; + sum = sum + co3 * smem[2 + tid2 + 2]; + + const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; + + if (dst_x < dstCols) + dst[y * dstStep / 16 + dst_x] = sum; + } +} diff --git a/modules/imgproc/src/opencl/remap.cl b/modules/imgproc/src/opencl/remap.cl new file mode 100644 index 0000000000..d545497f0f --- /dev/null +++ b/modules/imgproc/src/opencl/remap.cl @@ -0,0 +1,323 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. 
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Wu Zailong, bullet@yeah.net +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif +#endif + +#ifdef INTER_NEAREST +#define convertToWT +#endif + +#ifdef BORDER_CONSTANT +#define EXTRAPOLATE(v2, v) v = scalar; +#elif defined BORDER_REPLICATE +#define EXTRAPOLATE(v2, v) \ + { \ + v2 = max(min(v2, (int2)(src_cols - 1, src_rows - 1)), zero); \ + v = convertToWT(src[mad24(v2.y, src_step, v2.x + src_offset)]); \ + } +#elif defined BORDER_WRAP +#define EXTRAPOLATE(v2, v) \ + { \ + if (v2.x < 0) \ + v2.x -= ((v2.x - src_cols + 1) / src_cols) * src_cols; \ + if (v2.x >= src_cols) \ + v2.x %= src_cols; \ + \ + if (v2.y < 0) \ + v2.y -= ((v2.y - src_rows + 1) / src_rows) * src_rows; \ + if( v2.y >= src_rows ) \ + v2.y %= src_rows; \ + v = convertToWT(src[mad24(v2.y, src_step, v2.x + src_offset)]); \ + } +#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101) +#ifdef BORDER_REFLECT +#define DELTA int delta = 0 +#else +#define DELTA int delta = 1 +#endif +#define EXTRAPOLATE(v2, v) \ + { \ + DELTA; \ + if (src_cols == 1) \ + v2.x = 0; \ + else \ + do \ + { \ + if( v2.x < 0 ) \ + v2.x = -v2.x - 1 + delta; \ + else \ + v2.x = src_cols - 1 - (v2.x - src_cols) - delta; \ + } \ + while (v2.x >= src_cols || v2.x < 0); \ + \ + if (src_rows == 1) \ + v2.y = 0; \ + else \ + do \ + { \ + if( v2.y < 0 ) \ + v2.y = -v2.y - 1 + delta; \ + else \ + v2.y = src_rows - 1 - (v2.y - src_rows) - delta; \ + } \ + while (v2.y >= src_rows || v2.y < 0); \ + v = convertToWT(src[mad24(v2.y, src_step, v2.x + src_offset)]); \ + } +#else +#error No extrapolation method +#endif + +#define NEED_EXTRAPOLATION(gx, gy) (gx >= src_cols || gy >= src_rows || gx < 0 || gy < 0) + +#ifdef INTER_NEAREST + +__kernel void remap_2_32FC1(__global const T * restrict src, __global T * dst, + __global float * map1, __global float * map2, + int src_offset, int dst_offset, int map1_offset, int map2_offset, + int 
src_step, int dst_step, int map1_step, int map2_step, + int src_cols, int src_rows, int dst_cols, int dst_rows, T scalar) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (x < dst_cols && y < dst_rows) + { + int dstIdx = mad24(y, dst_step, x + dst_offset); + int map1Idx = mad24(y, map1_step, x + map1_offset); + int map2Idx = mad24(y, map2_step, x + map2_offset); + + int gx = convert_int_sat_rte(map1[map1Idx]); + int gy = convert_int_sat_rte(map2[map2Idx]); + + if (NEED_EXTRAPOLATION(gx, gy)) + { + int2 gxy = (int2)(gx, gy), zero = (int2)(0); + EXTRAPOLATE(gxy, dst[dstIdx]); + } + else + { + int srcIdx = mad24(gy, src_step, gx + src_offset); + dst[dstIdx] = src[srcIdx]; + } + } +} + +__kernel void remap_32FC2(__global const T * restrict src, __global T * dst, __global float2 * map1, + int src_offset, int dst_offset, int map1_offset, + int src_step, int dst_step, int map1_step, + int src_cols, int src_rows, int dst_cols, int dst_rows, T scalar) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (x < dst_cols && y < dst_rows) + { + int dstIdx = mad24(y, dst_step, x + dst_offset); + int map1Idx = mad24(y, map1_step, x + map1_offset); + + int2 gxy = convert_int2_sat_rte(map1[map1Idx]); + int gx = gxy.x, gy = gxy.y; + + if (NEED_EXTRAPOLATION(gx, gy)) + { + int2 zero = (int2)(0); + EXTRAPOLATE(gxy, dst[dstIdx]); + } + else + { + int srcIdx = mad24(gy, src_step, gx + src_offset); + dst[dstIdx] = src[srcIdx]; + } + } +} + +__kernel void remap_16SC2(__global const T * restrict src, __global T * dst, __global short2 * map1, + int src_offset, int dst_offset, int map1_offset, + int src_step, int dst_step, int map1_step, + int src_cols, int src_rows, int dst_cols, int dst_rows, T scalar) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (x < dst_cols && y < dst_rows) + { + int dstIdx = mad24(y, dst_step, x + dst_offset); + int map1Idx = mad24(y, map1_step, x + map1_offset); + + int2 gxy = convert_int2(map1[map1Idx]); + int gx = 
gxy.x, gy = gxy.y; + + if (NEED_EXTRAPOLATION(gx, gy)) + { + int2 zero = (int2)(0); + EXTRAPOLATE(gxy, dst[dstIdx]); + } + else + { + int srcIdx = mad24(gy, src_step, gx + src_offset); + dst[dstIdx] = src[srcIdx]; + } + } +} + +#elif INTER_LINEAR + +__kernel void remap_2_32FC1(__global T const * restrict src, __global T * dst, + __global float * map1, __global float * map2, + int src_offset, int dst_offset, int map1_offset, int map2_offset, + int src_step, int dst_step, int map1_step, int map2_step, + int src_cols, int src_rows, int dst_cols, int dst_rows, T nVal) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (x < dst_cols && y < dst_rows) + { + int dstIdx = mad24(y, dst_step, x + dst_offset); + int map1Idx = mad24(y, map1_step, x + map1_offset); + int map2Idx = mad24(y, map2_step, x + map2_offset); + + float2 map_data = (float2)(map1[map1Idx], map2[map2Idx]); + + int2 map_dataA = convert_int2_sat_rtn(map_data); + int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y); + int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1); + int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y +1); + int2 zero = (int2)(0); + + float2 _u = map_data - convert_float2(map_dataA); + WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)32)) / (WT2)32; + WT scalar = convertToWT(nVal); + WT a = scalar, b = scalar, c = scalar, d = scalar; + + if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y)) + a = convertToWT(src[mad24(map_dataA.y, src_step, map_dataA.x + src_offset)]); + else + EXTRAPOLATE(map_dataA, a); + + if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y)) + b = convertToWT(src[mad24(map_dataB.y, src_step, map_dataB.x + src_offset)]); + else + EXTRAPOLATE(map_dataB, b); + + if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y)) + c = convertToWT(src[mad24(map_dataC.y, src_step, map_dataC.x + src_offset)]); + else + EXTRAPOLATE(map_dataC, c); + + if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y)) + d = convertToWT(src[mad24(map_dataD.y, src_step, map_dataD.x + 
src_offset)]); + else + EXTRAPOLATE(map_dataD, d); + + WT dst_data = a * (WT)(1 - u.x) * (WT)(1 - u.y) + + b * (WT)(u.x) * (WT)(1 - u.y) + + c * (WT)(1 - u.x) * (WT)(u.y) + + d * (WT)(u.x) * (WT)(u.y); + dst[dstIdx] = convertToT(dst_data); + } +} + +__kernel void remap_32FC2(__global T const * restrict src, __global T * dst, + __global float2 * map1, + int src_offset, int dst_offset, int map1_offset, + int src_step, int dst_step, int map1_step, + int src_cols, int src_rows, int dst_cols, int dst_rows, T nVal) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (x < dst_cols && y < dst_rows) + { + int dstIdx = mad24(y, dst_step, x + dst_offset); + int map1Idx = mad24(y, map1_step, x + map1_offset); + + float2 map_data = map1[map1Idx]; + int2 map_dataA = convert_int2_sat_rtn(map_data); + int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y); + int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1); + int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1); + int2 zero = (int2)(0); + + float2 _u = map_data - convert_float2(map_dataA); + WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)32)) / (WT2)32; + WT scalar = convertToWT(nVal); + WT a = scalar, b = scalar, c = scalar, d = scalar; + + if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y)) + a = convertToWT(src[mad24(map_dataA.y, src_step, map_dataA.x + src_offset)]); + else + EXTRAPOLATE(map_dataA, a); + + if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y)) + b = convertToWT(src[mad24(map_dataB.y, src_step, map_dataB.x + src_offset)]); + else + EXTRAPOLATE(map_dataB, b); + + if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y)) + c = convertToWT(src[mad24(map_dataC.y, src_step, map_dataC.x + src_offset)]); + else + EXTRAPOLATE(map_dataC, c); + + if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y)) + d = convertToWT(src[mad24(map_dataD.y, src_step, map_dataD.x + src_offset)]); + else + EXTRAPOLATE(map_dataD, d); + + WT dst_data = a * (WT)(1 - u.x) * (WT)(1 - u.y) + + b * (WT)(u.x) * (WT)(1 - u.y) 
+ + c * (WT)(1 - u.x) * (WT)(u.y) + + d * (WT)(u.x) * (WT)(u.y); + dst[dstIdx] = convertToT(dst_data); + } +} + +#endif diff --git a/modules/imgproc/src/opencl/resize.cl b/modules/imgproc/src/opencl/resize.cl new file mode 100644 index 0000000000..c53afd7434 --- /dev/null +++ b/modules/imgproc/src/opencl/resize.cl @@ -0,0 +1,152 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Zhang Ying, zhangying913@gmail.com +// Niko Li, newlife20080214@gmail.com +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. 
+// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + + +// resize kernel +// Currently, CV_8UC1 CV_8UC4 CV_32FC1 and CV_32FC4are supported. +// We shall support other types later if necessary. + +#if defined DOUBLE_SUPPORT +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#define F double +#else +#define F float +#endif + +#define INTER_RESIZE_COEF_BITS 11 +#define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS) +#define CAST_BITS (INTER_RESIZE_COEF_BITS << 1) +#define CAST_SCALE (1.0f/(1<=srccols ) x=srccols-1,u=0; + if ( y<0 ) y=0,v=0; + if ( y>=srcrows ) y=srcrows-1,v=0; + + int y_ = INC(y,srcrows); + int x_ = INC(x,srccols); + const PIXTYPE* src = (const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE)); + +#if depth == 0 + u = u * INTER_RESIZE_COEF_SCALE; + v = v * INTER_RESIZE_COEF_SCALE; + + int U = rint(u); + int V = rint(v); + int U1 = rint(INTER_RESIZE_COEF_SCALE - u); + int V1 = rint(INTER_RESIZE_COEF_SCALE - v); + + WORKTYPE data0 = convertToWT(*(const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE))); + WORKTYPE data1 = convertToWT(*(const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x_*PIXSIZE))); + WORKTYPE data2 = convertToWT(*(const PIXTYPE*)(srcptr + mad24(y_, srcstep, 
srcoffset + x*PIXSIZE))); + WORKTYPE data3 = convertToWT(*(const PIXTYPE*)(srcptr + mad24(y_, srcstep, srcoffset + x_*PIXSIZE))); + WORKTYPE val = mul24((WORKTYPE)mul24(U1, V1), data0) + mul24((WORKTYPE)mul24(U, V1), data1) + + mul24((WORKTYPE)mul24(U1, V), data2) + mul24((WORKTYPE)mul24(U, V), data3); + + PIXTYPE uval = convertToDT((val + (1<<(CAST_BITS-1)))>>CAST_BITS); +#else + float u1 = 1.f-u; + float v1 = 1.f-v; + WORKTYPE data0 = convertToWT(*(const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE))); + WORKTYPE data1 = convertToWT(*(const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x_*PIXSIZE))); + WORKTYPE data2 = convertToWT(*(const PIXTYPE*)(srcptr + mad24(y_, srcstep, srcoffset + x*PIXSIZE))); + WORKTYPE data3 = convertToWT(*(const PIXTYPE*)(srcptr + mad24(y_, srcstep, srcoffset + x_*PIXSIZE))); + PIXTYPE uval = u1 * v1 * s_data1 + u * v1 * s_data2 + u1 * v *s_data3 + u * v *s_data4; +#endif + + if(dx < dstcols && dy < dstrows) + { + PIXTYPE* dst = (PIXTYPE*)(dstptr + mad24(dy, dststep, dstoffset + dx*PIXSIZE)); + dst[0] = uval; + } +} + +#elif defined INTER_NEAREST + +__kernel void resizeNN(__global const uchar* srcptr, int srcstep, int srcoffset, + int srcrows, int srccols, + __global uchar* dstptr, int dststep, int dstoffset, + int dstrows, int dstcols, + float ifx, float ify) +{ + int dx = get_global_id(0); + int dy = get_global_id(1); + + if( dx < dstcols && dy < dstrows ) + { + F s1 = dx*ifx; + F s2 = dy*ify; + int sx = min(convert_int_rtz(s1), srccols-1); + int sy = min(convert_int_rtz(s2), srcrows-1); + PIXTYPE* dst = (PIXTYPE*)(dstptr + + mad24(dy, dststep, dstoffset + dx*PIXSIZE)); + const PIXTYPE* src = (const PIXTYPE*)(srcptr + + mad24(sy, srcstep, srcoffset + sx*PIXSIZE)); + dst[0] = src[0]; + } +} + +#endif + diff --git a/modules/imgproc/src/opencl/threshold.cl b/modules/imgproc/src/opencl/threshold.cl new file mode 100644 index 0000000000..8d7c77e1fa --- /dev/null +++ b/modules/imgproc/src/opencl/threshold.cl @@ -0,0 +1,152 
@@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Zhang Ying, zhangying913@gmail.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if defined (DOUBLE_SUPPORT) +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#endif + +// threshold type: +// enum { THRESH_BINARY=0, THRESH_BINARY_INV=1, THRESH_TRUNC=2, THRESH_TOZERO=3, +// THRESH_TOZERO_INV=4, THRESH_MASK=7, THRESH_OTSU=8 }; + +__kernel void threshold_C1_D0(__global const uchar * restrict src, __global uchar *dst, + int src_offset, int src_step, + int dst_offset, int dst_rows, int dst_cols, int dst_step, + uchar thresh, uchar max_val, int thresh_type + ) +{ + int gx = get_global_id(0); + const int gy = get_global_id(1); + + int offset = (dst_offset & 15); + src_offset -= offset; + + int dstart = (gx << 4) - offset; + if(dstart < dst_cols && gy < dst_rows) + { + uchar16 sdata = vload16(gx, src+src_offset+gy*src_step); + uchar16 ddata; + uchar16 zero = 0; + switch (thresh_type) + { + case 0: + ddata = ((sdata > thresh) ) ? (uchar16)(max_val) : (uchar16)(0); + break; + case 1: + ddata = ((sdata > thresh)) ? zero : (uchar16)(max_val); + break; + case 2: + ddata = ((sdata > thresh)) ? (uchar16)(thresh) : sdata; + break; + case 3: + ddata = ((sdata > thresh)) ? sdata : zero; + break; + case 4: + ddata = ((sdata > thresh)) ? 
zero : sdata; + break; + default: + ddata = sdata; + } + int16 dpos = (int16)(dstart, dstart+1, dstart+2, dstart+3, dstart+4, dstart+5, dstart+6, dstart+7, dstart+8, + dstart+9, dstart+10, dstart+11, dstart+12, dstart+13, dstart+14, dstart+15); + uchar16 dVal = *(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart); + int16 con = dpos >= 0 && dpos < dst_cols; + ddata = convert_uchar16(con != 0) ? ddata : dVal; + if(dstart < dst_cols) + { + *(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart) = ddata; + } + } +} + + +__kernel void threshold_C1_D5(__global const float * restrict src, __global float *dst, + int src_offset, int src_step, + int dst_offset, int dst_rows, int dst_cols, int dst_step, + float thresh, float max_val, int thresh_type + ) +{ + const int gx = get_global_id(0); + const int gy = get_global_id(1); + + int offset = (dst_offset & 3); + src_offset -= offset; + + int dstart = (gx << 2) - offset; + if(dstart < dst_cols && gy < dst_rows) + { + float4 sdata = vload4(gx, src+src_offset+gy*src_step); + float4 ddata; + float4 zero = 0; + switch (thresh_type) + { + case 0: + ddata = sdata > thresh ? (float4)(max_val) : (float4)(0.f); + break; + case 1: + ddata = sdata > thresh ? zero : (float4)max_val; + break; + case 2: + ddata = sdata > thresh ? (float4)thresh : sdata; + break; + case 3: + ddata = sdata > thresh ? sdata : (float4)(0.f); + break; + case 4: + ddata = sdata > thresh ? (float4)(0.f) : sdata; + break; + default: + ddata = sdata; + } + int4 dpos = (int4)(dstart, dstart+1, dstart+2, dstart+3); + float4 dVal = *(__global float4*)(dst+dst_offset+gy*dst_step+dstart); + int4 con = dpos >= 0 && dpos < dst_cols; + ddata = convert_float4(con) != (float4)(0) ? 
ddata : dVal; + if(dstart < dst_cols) + { + *(__global float4*)(dst+dst_offset+gy*dst_step+dstart) = ddata; + } + } +} diff --git a/modules/imgproc/src/opencl/warpaffine.cl b/modules/imgproc/src/opencl/warpaffine.cl new file mode 100644 index 0000000000..caafdfb92c --- /dev/null +++ b/modules/imgproc/src/opencl/warpaffine.cl @@ -0,0 +1,761 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Zhang Ying, zhangying913@gmail.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. 
+// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + + +//warpAffine kernel +//support data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic. + +#if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif +typedef double F; +typedef double4 F4; +#define convert_F4 convert_double4 +#else +typedef float F; +typedef float4 F4; +#define convert_F4 convert_float4 +#endif + +#define INTER_BITS 5 +#define INTER_TAB_SIZE (1 << INTER_BITS) +#define INTER_SCALE 1.f/INTER_TAB_SIZE +#define AB_BITS max(10, (int)INTER_BITS) +#define AB_SCALE (1 << AB_BITS) +#define INTER_REMAP_COEF_BITS 15 +#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS) + +inline void interpolateCubic( float x, float* coeffs ) +{ + const float A = -0.75f; + + coeffs[0] = ((A*(x + 1.f) - 5.0f*A)*(x + 1.f) + 8.0f*A)*(x + 1.f) - 4.0f*A; + coeffs[1] = ((A + 2.f)*x - (A + 3.f))*x*x + 1.f; + coeffs[2] = ((A + 2.f)*(1.f - x) - (A + 3.f))*(1.f - x)*(1.f - x) + 1.f; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} + + 
+/**********************************************8UC1********************************************* +***********************************************************************************************/ +__kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows, + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) +{ + int dx = get_global_id(0); + int dy = get_global_id(1); + + if( dx < threadCols && dy < dst_rows) + { + dx = (dx<<2) - (dst_offset&3); + + int round_delta = (AB_SCALE>>1); + + int4 X, Y; + int4 sx, sy; + int4 DX = (int4)(dx, dx+1, dx+2, dx+3); + DX = (DX << AB_BITS); + F4 M0DX, M3DX; + M0DX = M[0] * convert_F4(DX); + M3DX = M[3] * convert_F4(DX); + X = convert_int4(rint(M0DX)); + Y = convert_int4(rint(M3DX)); + int tmp1, tmp2; + tmp1 = rint((M[1]*dy + M[2]) * AB_SCALE); + tmp2 = rint((M[4]*dy + M[5]) * AB_SCALE); + + X += tmp1 + round_delta; + Y += tmp2 + round_delta; + + sx = convert_int4(convert_short4(X >> AB_BITS)); + sy = convert_int4(convert_short4(Y >> AB_BITS)); + + __global uchar4 * d = (__global uchar4 *)(dst+dst_offset+dy*dstStep+dx); + uchar4 dval = *d; + DX = (int4)(dx, dx+1, dx+2, dx+3); + int4 dcon = DX >= 0 && DX < dst_cols && dy >= 0 && dy < dst_rows; + int4 scon = sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows; + int4 spos = src_offset + sy * srcStep + sx; + uchar4 sval; + sval.s0 = scon.s0 ? src[spos.s0] : 0; + sval.s1 = scon.s1 ? src[spos.s1] : 0; + sval.s2 = scon.s2 ? src[spos.s2] : 0; + sval.s3 = scon.s3 ? src[spos.s3] : 0; + dval = convert_uchar4(dcon) != (uchar4)(0,0,0,0) ? 
sval : dval; + *d = dval; + } +} + +__kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst, int src_cols, int src_rows, + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) +{ + int dx = get_global_id(0); + int dy = get_global_id(1); + + + if( dx < threadCols && dy < dst_rows) + { + dx = (dx<<2) - (dst_offset&3); + + int round_delta = ((AB_SCALE >> INTER_BITS) >> 1); + + int4 X, Y; + short4 ax, ay; + int4 sx, sy; + int4 DX = (int4)(dx, dx+1, dx+2, dx+3); + DX = (DX << AB_BITS); + F4 M0DX, M3DX; + M0DX = M[0] * convert_F4(DX); + M3DX = M[3] * convert_F4(DX); + X = convert_int4(rint(M0DX)); + Y = convert_int4(rint(M3DX)); + + int tmp1, tmp2; + tmp1 = rint((M[1]*dy + M[2]) * AB_SCALE); + tmp2 = rint((M[4]*dy + M[5]) * AB_SCALE); + + X += tmp1 + round_delta; + Y += tmp2 + round_delta; + + X = X >> (AB_BITS - INTER_BITS); + Y = Y >> (AB_BITS - INTER_BITS); + + sx = convert_int4(convert_short4(X >> INTER_BITS)); + sy = convert_int4(convert_short4(Y >> INTER_BITS)); + ax = convert_short4(X & (INTER_TAB_SIZE-1)); + ay = convert_short4(Y & (INTER_TAB_SIZE-1)); + + uchar4 v0, v1, v2,v3; + int4 scon0, scon1, scon2, scon3; + int4 spos0, spos1, spos2, spos3; + + scon0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows); + scon1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows); + scon2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows); + scon3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows); + spos0 = src_offset + sy * srcStep + sx; + spos1 = src_offset + sy * srcStep + sx + 1; + spos2 = src_offset + (sy+1) * srcStep + sx; + spos3 = src_offset + (sy+1) * srcStep + sx + 1; + + v0.s0 = scon0.s0 ? src[spos0.s0] : 0; + v1.s0 = scon1.s0 ? src[spos1.s0] : 0; + v2.s0 = scon2.s0 ? src[spos2.s0] : 0; + v3.s0 = scon3.s0 ? src[spos3.s0] : 0; + + v0.s1 = scon0.s1 ? src[spos0.s1] : 0; + v1.s1 = scon1.s1 ? 
src[spos1.s1] : 0; + v2.s1 = scon2.s1 ? src[spos2.s1] : 0; + v3.s1 = scon3.s1 ? src[spos3.s1] : 0; + + v0.s2 = scon0.s2 ? src[spos0.s2] : 0; + v1.s2 = scon1.s2 ? src[spos1.s2] : 0; + v2.s2 = scon2.s2 ? src[spos2.s2] : 0; + v3.s2 = scon3.s2 ? src[spos3.s2] : 0; + + v0.s3 = scon0.s3 ? src[spos0.s3] : 0; + v1.s3 = scon1.s3 ? src[spos1.s3] : 0; + v2.s3 = scon2.s3 ? src[spos2.s3] : 0; + v3.s3 = scon3.s3 ? src[spos3.s3] : 0; + + short4 itab0, itab1, itab2, itab3; + float4 taby, tabx; + taby = INTER_SCALE * convert_float4(ay); + tabx = INTER_SCALE * convert_float4(ax); + + itab0 = convert_short4_sat(( (1.0f-taby)*(1.0f-tabx) * (float4)INTER_REMAP_COEF_SCALE )); + itab1 = convert_short4_sat(( (1.0f-taby)*tabx * (float4)INTER_REMAP_COEF_SCALE )); + itab2 = convert_short4_sat(( taby*(1.0f-tabx) * (float4)INTER_REMAP_COEF_SCALE )); + itab3 = convert_short4_sat(( taby*tabx * (float4)INTER_REMAP_COEF_SCALE )); + + + int4 val; + uchar4 tval; + val = convert_int4(v0) * convert_int4(itab0) + convert_int4(v1) * convert_int4(itab1) + + convert_int4(v2) * convert_int4(itab2) + convert_int4(v3) * convert_int4(itab3); + tval = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; + + __global uchar4 * d =(__global uchar4 *)(dst+dst_offset+dy*dstStep+dx); + uchar4 dval = *d; + DX = (int4)(dx, dx+1, dx+2, dx+3); + int4 dcon = DX >= 0 && DX < dst_cols && dy >= 0 && dy < dst_rows; + dval = convert_uchar4(dcon != 0) ? 
tval : dval; + *d = dval; + } +} + +__kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows, + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) +{ + int dx = get_global_id(0); + int dy = get_global_id(1); + + if( dx < threadCols && dy < dst_rows) + { + int round_delta = ((AB_SCALE>>INTER_BITS)>>1); + + int X0 = rint(M[0] * dx * AB_SCALE); + int Y0 = rint(M[3] * dx * AB_SCALE); + X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; + Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; + int X = X0 >> (AB_BITS - INTER_BITS); + int Y = Y0 >> (AB_BITS - INTER_BITS); + + short sx = (short)(X >> INTER_BITS) - 1; + short sy = (short)(Y >> INTER_BITS) - 1; + short ay = (short)(Y & (INTER_TAB_SIZE-1)); + short ax = (short)(X & (INTER_TAB_SIZE-1)); + + uchar v[16]; + int i, j; + +#pragma unroll 4 + for(i=0; i<4; i++) + for(j=0; j<4; j++) + { + v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : 0; + } + + short itab[16]; + float tab1y[4], tab1x[4]; + float axx, ayy; + + ayy = 1.f/INTER_TAB_SIZE * ay; + axx = 1.f/INTER_TAB_SIZE * ax; + interpolateCubic(ayy, tab1y); + interpolateCubic(axx, tab1x); + int isum = 0; + +#pragma unroll 16 + for( i=0; i<16; i++ ) + { + F v = tab1y[(i>>2)] * tab1x[(i&3)]; + isum += itab[i] = convert_short_sat( rint( v * INTER_REMAP_COEF_SCALE ) ); + } + + if( isum != INTER_REMAP_COEF_SCALE ) + { + int k1, k2; + int diff = isum - INTER_REMAP_COEF_SCALE; + int Mk1=2, Mk2=2, mk1=2, mk2=2; + for( k1 = 2; k1 < 4; k1++ ) + for( k2 = 2; k2 < 4; k2++ ) + { + if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) + mk1 = k1, mk2 = k2; + else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) + Mk1 = k1, Mk2 = k2; + } + diff<0 ? 
(itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); + } + + if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) + { + int sum=0; + for ( i =0; i<16; i++ ) + { + sum += v[i] * itab[i] ; + } + dst[dst_offset+dy*dstStep+dx] = convert_uchar_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; + } + } +} + +/**********************************************8UC4********************************************* +***********************************************************************************************/ + +__kernel void warpAffineNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows, + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) +{ + int dx = get_global_id(0); + int dy = get_global_id(1); + + if( dx < threadCols && dy < dst_rows) + { + int round_delta = (AB_SCALE >> 1); + + int X0 = rint(M[0] * dx * AB_SCALE); + int Y0 = rint(M[3] * dx * AB_SCALE); + X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; + Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; + + int sx0 = (short)(X0 >> AB_BITS); + int sy0 = (short)(Y0 >> AB_BITS); + + if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) + dst[(dst_offset>>2)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0=0 && sy0>2)+sy0*(srcStep>>2)+sx0] : (uchar4)0; + } +} + +__kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows, + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) +{ + int dx = get_global_id(0); + int dy = get_global_id(1); + + + if( dx < threadCols && dy < dst_rows) + { + int round_delta = AB_SCALE/INTER_TAB_SIZE/2; + + src_offset = (src_offset>>2); + srcStep = (srcStep>>2); + + int tmp = (dx << AB_BITS); + int X0 = rint(M[0] * tmp); + int Y0 = rint(M[3] * tmp); + X0 += 
rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; + Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; + X0 = X0 >> (AB_BITS - INTER_BITS); + Y0 = Y0 >> (AB_BITS - INTER_BITS); + + short sx0 = (short)(X0 >> INTER_BITS); + short sy0 = (short)(Y0 >> INTER_BITS); + short ax0 = (short)(X0 & (INTER_TAB_SIZE-1)); + short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1)); + + int4 v0, v1, v2, v3; + + v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? convert_int4(src[src_offset+sy0 * srcStep + sx0]) : 0; + v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? convert_int4(src[src_offset+sy0 * srcStep + sx0+1]) : 0; + v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? convert_int4(src[src_offset+(sy0+1) * srcStep + sx0]) : 0; + v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? convert_int4(src[src_offset+(sy0+1) * srcStep + sx0+1]) : 0; + + int itab0, itab1, itab2, itab3; + float taby, tabx; + taby = 1.f/INTER_TAB_SIZE*ay0; + tabx = 1.f/INTER_TAB_SIZE*ax0; + + itab0 = convert_short_sat(rint( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE )); + itab1 = convert_short_sat(rint( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE )); + itab2 = convert_short_sat(rint( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE )); + itab3 = convert_short_sat(rint( taby*tabx * INTER_REMAP_COEF_SCALE )); + + int4 val; + val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3; + + if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) + dst[(dst_offset>>2)+dy*(dstStep>>2)+dx] = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; + } +} + +__kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows, + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) +{ + int dx = get_global_id(0); + int dy = get_global_id(1); + + if( dx < threadCols && dy < dst_rows) + { + int 
round_delta = ((AB_SCALE>>INTER_BITS)>>1); + + src_offset = (src_offset>>2); + srcStep = (srcStep>>2); + dst_offset = (dst_offset>>2); + dstStep = (dstStep>>2); + + int tmp = (dx << AB_BITS); + int X0 = rint(M[0] * tmp); + int Y0 = rint(M[3] * tmp); + X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; + Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; + X0 = X0 >> (AB_BITS - INTER_BITS); + Y0 = Y0 >> (AB_BITS - INTER_BITS); + + int sx = (short)(X0 >> INTER_BITS) - 1; + int sy = (short)(Y0 >> INTER_BITS) - 1; + int ay = (short)(Y0 & (INTER_TAB_SIZE-1)); + int ax = (short)(X0 & (INTER_TAB_SIZE-1)); + + uchar4 v[16]; + int i,j; +#pragma unroll 4 + for(i=0; i<4; i++) + for(j=0; j<4; j++) + { + v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0; + } + int itab[16]; + float tab1y[4], tab1x[4]; + float axx, ayy; + + ayy = INTER_SCALE * ay; + axx = INTER_SCALE * ax; + interpolateCubic(ayy, tab1y); + interpolateCubic(axx, tab1x); + int isum = 0; + +#pragma unroll 16 + for( i=0; i<16; i++ ) + { + float tmp; + tmp = tab1y[(i>>2)] * tab1x[(i&3)] * INTER_REMAP_COEF_SCALE; + itab[i] = rint(tmp); + isum += itab[i]; + } + + if( isum != INTER_REMAP_COEF_SCALE ) + { + int k1, k2; + int diff = isum - INTER_REMAP_COEF_SCALE; + int Mk1=2, Mk2=2, mk1=2, mk2=2; + + for( k1 = 2; k1 < 4; k1++ ) + for( k2 = 2; k2 < 4; k2++ ) + { + + if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) + mk1 = k1, mk2 = k2; + else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) + Mk1 = k1, Mk2 = k2; + } + + diff<0 ? 
(itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); + } + + if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) + { + int4 sum=0; + for ( i =0; i<16; i++ ) + { + sum += convert_int4(v[i]) * itab[i]; + } + dst[dst_offset+dy*dstStep+dx] = convert_uchar4_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; + } + } +} + + +/**********************************************32FC1******************************************** +***********************************************************************************************/ + +__kernel void warpAffineNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) +{ + int dx = get_global_id(0); + int dy = get_global_id(1); + + if( dx < threadCols && dy < dst_rows) + { + int round_delta = AB_SCALE/2; + + int X0 = rint(M[0] * dx * AB_SCALE); + int Y0 = rint(M[3] * dx * AB_SCALE); + X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; + Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; + + short sx0 = (short)(X0 >> AB_BITS); + short sy0 = (short)(Y0 >> AB_BITS); + + if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) + dst[(dst_offset>>2)+dy*dstStep+dx]= (sx0>=0 && sx0=0 && sy0>2)+sy0*srcStep+sx0] : 0; + } +} + +__kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) +{ + int dx = get_global_id(0); + int dy = get_global_id(1); + + if( dx < threadCols && dy < dst_rows) + { + int round_delta = AB_SCALE/INTER_TAB_SIZE/2; + + src_offset = (src_offset>>2); + + int X0 = rint(M[0] * dx * AB_SCALE); + int Y0 = rint(M[3] * dx * AB_SCALE); + X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; + Y0 += rint((M[4]*dy + M[5]) * 
AB_SCALE) + round_delta; + X0 = X0 >> (AB_BITS - INTER_BITS); + Y0 = Y0 >> (AB_BITS - INTER_BITS); + + short sx0 = (short)(X0 >> INTER_BITS); + short sy0 = (short)(Y0 >> INTER_BITS); + short ax0 = (short)(X0 & (INTER_TAB_SIZE-1)); + short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1)); + + float v0, v1, v2, v3; + + v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0; + v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : 0; + v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : 0; + v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : 0; + + float tab[4]; + float taby[2], tabx[2]; + taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay0; + taby[1] = 1.f/INTER_TAB_SIZE*ay0; + tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0; + tabx[1] = 1.f/INTER_TAB_SIZE*ax0; + + tab[0] = taby[0] * tabx[0]; + tab[1] = taby[0] * tabx[1]; + tab[2] = taby[1] * tabx[0]; + tab[3] = taby[1] * tabx[1]; + + float sum = 0; + sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3]; + if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) + dst[(dst_offset>>2)+dy*dstStep+dx] = sum; + } +} + +__kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) +{ + int dx = get_global_id(0); + int dy = get_global_id(1); + + if( dx < threadCols && dy < dst_rows) + { + int round_delta = AB_SCALE/INTER_TAB_SIZE/2; + + src_offset = (src_offset>>2); + dst_offset = (dst_offset>>2); + + int X0 = rint(M[0] * dx * AB_SCALE); + int Y0 = rint(M[3] * dx * AB_SCALE); + X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; + Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; + X0 = X0 >> (AB_BITS - INTER_BITS); + Y0 = Y0 >> (AB_BITS - 
INTER_BITS);
+
+        short sx = (short)(X0 >> INTER_BITS) - 1;
+        short sy = (short)(Y0 >> INTER_BITS) - 1;
+        short ay = (short)(Y0 & (INTER_TAB_SIZE-1));
+        short ax = (short)(X0 & (INTER_TAB_SIZE-1));
+
+        float v[16];
+        int i;
+
+        for(i=0; i<16; i++)
+            v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0;
+
+        float tab[16];
+        float tab1y[4], tab1x[4];
+        float axx, ayy;
+
+        ayy = 1.f/INTER_TAB_SIZE * ay;
+        axx = 1.f/INTER_TAB_SIZE * ax;
+        interpolateCubic(ayy, tab1y);
+        interpolateCubic(axx, tab1x);
+
+#pragma unroll 4
+        for( i=0; i<16; i++ )
+        {
+            tab[i] = tab1y[(i>>2)] * tab1x[(i&3)];
+        }
+
+        if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
+        {
+            float sum = 0;
+#pragma unroll 4
+            for ( i =0; i<16; i++ )
+            {
+                sum += v[i] * tab[i];
+            }
+            dst[dst_offset+dy*dstStep+dx] = sum;
+
+        }
+    }
+}
+
+
+/**********************************************32FC4********************************************
+***********************************************************************************************/
+
+__kernel void warpAffineNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
+                            int dst_cols, int dst_rows, int srcStep, int dstStep,
+                            int src_offset, int dst_offset, __constant F * M, int threadCols )
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    if( dx < threadCols && dy < dst_rows)
+    {
+        int round_delta = AB_SCALE/2;
+
+        int X0 = rint(M[0] * dx * AB_SCALE);
+        int Y0 = rint(M[3] * dx * AB_SCALE);
+        X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
+        Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
+
+        short sx0 = (short)(X0 >> AB_BITS);
+        short sy0 = (short)(Y0 >> AB_BITS);
+
+        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
+            dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>4)+sy0*(srcStep>>2)+sx0] : (float4)0;
+    }
+}
+
+__kernel void warpAffineLinear_C4_D5(__global float4 * 
dst, int src_cols, int src_rows, + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) +{ + int dx = get_global_id(0); + int dy = get_global_id(1); + + if( dx < threadCols && dy < dst_rows) + { + int round_delta = AB_SCALE/INTER_TAB_SIZE/2; + + src_offset = (src_offset>>4); + dst_offset = (dst_offset>>4); + srcStep = (srcStep>>2); + dstStep = (dstStep>>2); + + int X0 = rint(M[0] * dx * AB_SCALE); + int Y0 = rint(M[3] * dx * AB_SCALE); + X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; + Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; + X0 = X0 >> (AB_BITS - INTER_BITS); + Y0 = Y0 >> (AB_BITS - INTER_BITS); + + short sx0 = (short)(X0 >> INTER_BITS); + short sy0 = (short)(Y0 >> INTER_BITS); + short ax0 = (short)(X0 & (INTER_TAB_SIZE-1)); + short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1)); + + float4 v0, v1, v2, v3; + + v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : (float4)0; + v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : (float4)0; + v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : (float4)0; + v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? 
src[src_offset+(sy0+1) * srcStep + sx0+1] : (float4)0; + + float tab[4]; + float taby[2], tabx[2]; + taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay0; + taby[1] = 1.f/INTER_TAB_SIZE*ay0; + tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0; + tabx[1] = 1.f/INTER_TAB_SIZE*ax0; + + tab[0] = taby[0] * tabx[0]; + tab[1] = taby[0] * tabx[1]; + tab[2] = taby[1] * tabx[0]; + tab[3] = taby[1] * tabx[1]; + + float4 sum = 0; + sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3]; + if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) + dst[dst_offset+dy*dstStep+dx] = sum; + } +} + +__kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) +{ + int dx = get_global_id(0); + int dy = get_global_id(1); + + if( dx < threadCols && dy < dst_rows) + { + int round_delta = AB_SCALE/INTER_TAB_SIZE/2; + + src_offset = (src_offset>>4); + dst_offset = (dst_offset>>4); + srcStep = (srcStep>>2); + dstStep = (dstStep>>2); + + int X0 = rint(M[0] * dx * AB_SCALE); + int Y0 = rint(M[3] * dx * AB_SCALE); + X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; + Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; + X0 = X0 >> (AB_BITS - INTER_BITS); + Y0 = Y0 >> (AB_BITS - INTER_BITS); + + short sx = (short)(X0 >> INTER_BITS) - 1; + short sy = (short)(Y0 >> INTER_BITS) - 1; + short ay = (short)(Y0 & (INTER_TAB_SIZE-1)); + short ax = (short)(X0 & (INTER_TAB_SIZE-1)); + + float4 v[16]; + int i; + + for(i=0; i<16; i++) + v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? 
src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float4)0; + + float tab[16]; + float tab1y[4], tab1x[4]; + float axx, ayy; + + ayy = 1.f/INTER_TAB_SIZE * ay; + axx = 1.f/INTER_TAB_SIZE * ax; + interpolateCubic(ayy, tab1y); + interpolateCubic(axx, tab1x); + +#pragma unroll 4 + for( i=0; i<16; i++ ) + { + tab[i] = tab1y[(i>>2)] * tab1x[(i&3)]; + } + + if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) + { + float4 sum = 0; +#pragma unroll 4 + for ( i =0; i<16; i++ ) + { + sum += v[i] * tab[i]; + } + dst[dst_offset+dy*dstStep+dx] = sum; + + } + } +} diff --git a/modules/imgproc/src/opencl/warpperspective.cl b/modules/imgproc/src/opencl/warpperspective.cl new file mode 100644 index 0000000000..43863c1517 --- /dev/null +++ b/modules/imgproc/src/opencl/warpperspective.cl @@ -0,0 +1,688 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Zhang Ying, zhangying913@gmail.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + + +//wrapPerspective kernel +//support data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic. 
+ +#if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif +typedef double F; +typedef double4 F4; +#define convert_F4 convert_double4 +#else +typedef float F; +typedef float4 F4; +#define convert_F4 convert_float4 +#endif + + +#define INTER_BITS 5 +#define INTER_TAB_SIZE (1 << INTER_BITS) +#define INTER_SCALE 1.f/INTER_TAB_SIZE +#define AB_BITS max(10, (int)INTER_BITS) +#define AB_SCALE (1 << AB_BITS) +#define INTER_REMAP_COEF_BITS 15 +#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS) + +inline void interpolateCubic( float x, float* coeffs ) +{ + const float A = -0.75f; + + coeffs[0] = ((A*(x + 1.f) - 5.0f*A)*(x + 1.f) + 8.0f*A)*(x + 1.f) - 4.0f*A; + coeffs[1] = ((A + 2.f)*x - (A + 3.f))*x*x + 1.f; + coeffs[2] = ((A + 2.f)*(1.f - x) - (A + 3.f))*(1.f - x)*(1.f - x) + 1.f; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} + + +/**********************************************8UC1********************************************* +***********************************************************************************************/ +__kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows, + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) +{ + int dx = get_global_id(0); + int dy = get_global_id(1); + + if( dx < threadCols && dy < dst_rows) + { + dx = (dx<<2) - (dst_offset&3); + + F4 DX = (F4)(dx, dx+1, dx+2, dx+3); + F4 X0 = M[0]*DX + M[1]*dy + M[2]; + F4 Y0 = M[3]*DX + M[4]*dy + M[5]; + F4 W = M[6]*DX + M[7]*dy + M[8],one=1,zero=0; + W = (W!=zero) ? 
one/W : zero; + short4 X = convert_short4(rint(X0*W)); + short4 Y = convert_short4(rint(Y0*W)); + int4 sx = convert_int4(X); + int4 sy = convert_int4(Y); + + int4 DXD = (int4)(dx, dx+1, dx+2, dx+3); + __global uchar4 * d = (__global uchar4 *)(dst+dst_offset+dy*dstStep+dx); + uchar4 dval = *d; + int4 dcon = DXD >= 0 && DXD < dst_cols && dy >= 0 && dy < dst_rows; + int4 scon = sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows; + int4 spos = src_offset + sy * srcStep + sx; + uchar4 sval; + sval.s0 = scon.s0 ? src[spos.s0] : 0; + sval.s1 = scon.s1 ? src[spos.s1] : 0; + sval.s2 = scon.s2 ? src[spos.s2] : 0; + sval.s3 = scon.s3 ? src[spos.s3] : 0; + dval = convert_uchar4(dcon) != (uchar4)(0,0,0,0) ? sval : dval; + *d = dval; + } +} + +__kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst, + int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, + int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) +{ + int dx = get_global_id(0); + int dy = get_global_id(1); + + if( dx < threadCols && dy < dst_rows) + { + F X0 = M[0]*dx + M[1]*dy + M[2]; + F Y0 = M[3]*dx + M[4]*dy + M[5]; + F W = M[6]*dx + M[7]*dy + M[8]; + W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; + int X = rint(X0*W); + int Y = rint(Y0*W); + + int sx = (short)(X >> INTER_BITS); + int sy = (short)(Y >> INTER_BITS); + int ay = (short)(Y & (INTER_TAB_SIZE-1)); + int ax = (short)(X & (INTER_TAB_SIZE-1)); + + uchar v[4]; + int i; +#pragma unroll 4 + for(i=0; i<4; i++) + v[i] = (sx+(i&1) >= 0 && sx+(i&1) < src_cols && sy+(i>>1) >= 0 && sy+(i>>1) < src_rows) ? 
src[src_offset + (sy+(i>>1)) * srcStep + (sx+(i&1))] : (uchar)0; + + short itab[4]; + float tab1y[2], tab1x[2]; + tab1y[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay; + tab1y[1] = 1.f/INTER_TAB_SIZE*ay; + tab1x[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax; + tab1x[1] = 1.f/INTER_TAB_SIZE*ax; + +#pragma unroll 4 + for(i=0; i<4; i++) + { + float v = tab1y[(i>>1)] * tab1x[(i&1)]; + itab[i] = convert_short_sat(rint( v * INTER_REMAP_COEF_SCALE )); + } + if(dx >=0 && dx < dst_cols && dy >= 0 && dy < dst_rows) + { + int sum = 0; + for ( i =0; i<4; i++ ) + { + sum += v[i] * itab[i] ; + } + dst[dst_offset+dy*dstStep+dx] = convert_uchar_sat ( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; + } + } +} + +__kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows, + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) +{ + int dx = get_global_id(0); + int dy = get_global_id(1); + + if( dx < threadCols && dy < dst_rows) + { + F X0 = M[0]*dx + M[1]*dy + M[2]; + F Y0 = M[3]*dx + M[4]*dy + M[5]; + F W = M[6]*dx + M[7]*dy + M[8]; + W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; + int X = rint(X0*W); + int Y = rint(Y0*W); + + short sx = (short)(X >> INTER_BITS) - 1; + short sy = (short)(Y >> INTER_BITS) - 1; + short ay = (short)(Y & (INTER_TAB_SIZE-1)); + short ax = (short)(X & (INTER_TAB_SIZE-1)); + + uchar v[16]; + int i, j; + +#pragma unroll 4 + for(i=0; i<4; i++) + for(j=0; j<4; j++) + { + v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? 
src[src_offset+(sy+i) * srcStep + (sx+j)] : (uchar)0; + } + + short itab[16]; + float tab1y[4], tab1x[4]; + float axx, ayy; + + ayy = 1.f/INTER_TAB_SIZE * ay; + axx = 1.f/INTER_TAB_SIZE * ax; + interpolateCubic(ayy, tab1y); + interpolateCubic(axx, tab1x); + + int isum = 0; +#pragma unroll 16 + for( i=0; i<16; i++ ) + { + F v = tab1y[(i>>2)] * tab1x[(i&3)]; + isum += itab[i] = convert_short_sat( rint( v * INTER_REMAP_COEF_SCALE ) ); + } + if( isum != INTER_REMAP_COEF_SCALE ) + { + int k1, k2; + int diff = isum - INTER_REMAP_COEF_SCALE; + int Mk1=2, Mk2=2, mk1=2, mk2=2; + for( k1 = 2; k1 < 4; k1++ ) + for( k2 = 2; k2 < 4; k2++ ) + { + if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) + mk1 = k1, mk2 = k2; + else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) + Mk1 = k1, Mk2 = k2; + } + diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); + } + + + if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) + { + int sum=0; + for ( i =0; i<16; i++ ) + { + sum += v[i] * itab[i] ; + } + dst[dst_offset+dy*dstStep+dx] = convert_uchar_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; + } + } +} + +/**********************************************8UC4********************************************* +***********************************************************************************************/ + +__kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, + int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, + int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) +{ + int dx = get_global_id(0); + int dy = get_global_id(1); + + if( dx < threadCols && dy < dst_rows) + { + + F X0 = M[0]*dx + M[1]*dy + M[2]; + F Y0 = M[3]*dx + M[4]*dy + M[5]; + F W = M[6]*dx + M[7]*dy + M[8]; + W = (W != 0.0) ? 
1./W : 0.0;
+        int X = rint(X0*W);
+        int Y = rint(Y0*W);
+        short sx = (short)X;
+        short sy = (short)Y;
+
+        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
+            dst[(dst_offset>>2)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>2)+sy*(srcStep>>2)+sx] : (uchar4)0;
+    }
+}
+
+__kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
+                    int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
+                    int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    if( dx < threadCols && dy < dst_rows)
+    {
+        src_offset = (src_offset>>2);
+        srcStep = (srcStep>>2);
+
+        F X0 = M[0]*dx + M[1]*dy + M[2];
+        F Y0 = M[3]*dx + M[4]*dy + M[5];
+        F W = M[6]*dx + M[7]*dy + M[8];
+        W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
+        int X = rint(X0*W);
+        int Y = rint(Y0*W);
+
+        short sx = (short)(X >> INTER_BITS);
+        short sy = (short)(Y >> INTER_BITS);
+        short ay = (short)(Y & (INTER_TAB_SIZE-1));
+        short ax = (short)(X & (INTER_TAB_SIZE-1));
+
+
+        int4 v0, v1, v2, v3;
+
+        v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx]) : (int4)0;
+        v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx+1]) : (int4)0;
+        v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx]) : (int4)0;
+        v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? 
convert_int4(src[src_offset+(sy+1) * srcStep + sx+1]) : (int4)0; + + int itab0, itab1, itab2, itab3; + float taby, tabx; + taby = 1.f/INTER_TAB_SIZE*ay; + tabx = 1.f/INTER_TAB_SIZE*ax; + + itab0 = convert_short_sat(rint( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE )); + itab1 = convert_short_sat(rint( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE )); + itab2 = convert_short_sat(rint( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE )); + itab3 = convert_short_sat(rint( taby*tabx * INTER_REMAP_COEF_SCALE )); + + int4 val; + val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3; + + if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) + dst[(dst_offset>>2)+dy*(dstStep>>2)+dx] = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; + } +} + +__kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, + int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, + int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) +{ + int dx = get_global_id(0); + int dy = get_global_id(1); + + if( dx < threadCols && dy < dst_rows) + { + src_offset = (src_offset>>2); + srcStep = (srcStep>>2); + dst_offset = (dst_offset>>2); + dstStep = (dstStep>>2); + + F X0 = M[0]*dx + M[1]*dy + M[2]; + F Y0 = M[3]*dx + M[4]*dy + M[5]; + F W = M[6]*dx + M[7]*dy + M[8]; + W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; + int X = rint(X0*W); + int Y = rint(Y0*W); + + short sx = (short)(X >> INTER_BITS) - 1; + short sy = (short)(Y >> INTER_BITS) - 1; + short ay = (short)(Y & (INTER_TAB_SIZE-1)); + short ax = (short)(X & (INTER_TAB_SIZE-1)); + + uchar4 v[16]; + int i,j; +#pragma unroll 4 + for(i=0; i<4; i++) + for(j=0; j<4; j++) + { + v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? 
(src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0; + } + int itab[16]; + float tab1y[4], tab1x[4]; + float axx, ayy; + + ayy = INTER_SCALE * ay; + axx = INTER_SCALE * ax; + interpolateCubic(ayy, tab1y); + interpolateCubic(axx, tab1x); + int isum = 0; + +#pragma unroll 16 + for( i=0; i<16; i++ ) + { + float tmp; + tmp = tab1y[(i>>2)] * tab1x[(i&3)] * INTER_REMAP_COEF_SCALE; + itab[i] = rint(tmp); + isum += itab[i]; + } + + if( isum != INTER_REMAP_COEF_SCALE ) + { + int k1, k2; + int diff = isum - INTER_REMAP_COEF_SCALE; + int Mk1=2, Mk2=2, mk1=2, mk2=2; + + for( k1 = 2; k1 < 4; k1++ ) + for( k2 = 2; k2 < 4; k2++ ) + { + + if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) + mk1 = k1, mk2 = k2; + else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) + Mk1 = k1, Mk2 = k2; + } + + diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); + } + + if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) + { + int4 sum=0; + for ( i =0; i<16; i++ ) + { + sum += convert_int4(v[i]) * itab[i]; + } + dst[dst_offset+dy*dstStep+dx] = convert_uchar4_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; + } + } +} + + +/**********************************************32FC1******************************************** +***********************************************************************************************/ + +__kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) +{ + int dx = get_global_id(0); + int dy = get_global_id(1); + + if( dx < threadCols && dy < dst_rows) + { + F X0 = M[0]*dx + M[1]*dy + M[2]; + F Y0 = M[3]*dx + M[4]*dy + M[5]; + F W = M[6]*dx + M[7]*dy + M[8]; + W = (W != 0.0) ? 
1./W : 0.0;
+        int X = rint(X0*W);
+        int Y = rint(Y0*W);
+        short sx = (short)X;
+        short sy = (short)Y;
+
+        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
+            dst[(dst_offset>>2)+dy*dstStep+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>2)+sy*srcStep+sx] : 0;
+    }
+}
+
+__kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
+                            int dst_cols, int dst_rows, int srcStep, int dstStep,
+                            int src_offset, int dst_offset, __constant F * M, int threadCols )
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    if( dx < threadCols && dy < dst_rows)
+    {
+        src_offset = (src_offset>>2);
+
+        F X0 = M[0]*dx + M[1]*dy + M[2];
+        F Y0 = M[3]*dx + M[4]*dy + M[5];
+        F W = M[6]*dx + M[7]*dy + M[8];
+        W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
+        int X = rint(X0*W);
+        int Y = rint(Y0*W);
+
+        short sx = (short)(X >> INTER_BITS);
+        short sy = (short)(Y >> INTER_BITS);
+        short ay = (short)(Y & (INTER_TAB_SIZE-1));
+        short ax = (short)(X & (INTER_TAB_SIZE-1));
+
+        float v0, v1, v2, v3;
+
+        v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx] : (float)0;
+        v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx+1] : (float)0;
+        v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx] : (float)0;
+        v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? 
src[src_offset+(sy+1) * srcStep + sx+1] : (float)0; + + float tab[4]; + float taby[2], tabx[2]; + taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay; + taby[1] = 1.f/INTER_TAB_SIZE*ay; + tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax; + tabx[1] = 1.f/INTER_TAB_SIZE*ax; + + tab[0] = taby[0] * tabx[0]; + tab[1] = taby[0] * tabx[1]; + tab[2] = taby[1] * tabx[0]; + tab[3] = taby[1] * tabx[1]; + + float sum = 0; + sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3]; + if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) + dst[(dst_offset>>2)+dy*dstStep+dx] = sum; + } +} + +__kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) +{ + int dx = get_global_id(0); + int dy = get_global_id(1); + + if( dx < threadCols && dy < dst_rows) + { + src_offset = (src_offset>>2); + dst_offset = (dst_offset>>2); + + F X0 = M[0]*dx + M[1]*dy + M[2]; + F Y0 = M[3]*dx + M[4]*dy + M[5]; + F W = M[6]*dx + M[7]*dy + M[8]; + W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; + int X = rint(X0*W); + int Y = rint(Y0*W); + + short sx = (short)(X >> INTER_BITS) - 1; + short sy = (short)(Y >> INTER_BITS) - 1; + short ay = (short)(Y & (INTER_TAB_SIZE-1)); + short ax = (short)(X & (INTER_TAB_SIZE-1)); + + float v[16]; + int i; + + for(i=0; i<16; i++) + v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? 
src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float)0; + + float tab[16]; + float tab1y[4], tab1x[4]; + float axx, ayy; + + ayy = 1.f/INTER_TAB_SIZE * ay; + axx = 1.f/INTER_TAB_SIZE * ax; + interpolateCubic(ayy, tab1y); + interpolateCubic(axx, tab1x); + +#pragma unroll 4 + for( i=0; i<16; i++ ) + { + tab[i] = tab1y[(i>>2)] * tab1x[(i&3)]; + } + + if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) + { + float sum = 0; +#pragma unroll 4 + for ( i =0; i<16; i++ ) + { + sum += v[i] * tab[i]; + } + dst[dst_offset+dy*dstStep+dx] = sum; + + } + } +} + + +/**********************************************32FC4******************************************** +***********************************************************************************************/ + +__kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) +{ + int dx = get_global_id(0); + int dy = get_global_id(1); + + if( dx < threadCols && dy < dst_rows) + { + F X0 = M[0]*dx + M[1]*dy + M[2]; + F Y0 = M[3]*dx + M[4]*dy + M[5]; + F W = M[6]*dx + M[7]*dy + M[8]; + W =(W != 0.0)? 
1./W : 0.0;
+        int X = rint(X0*W);
+        int Y = rint(Y0*W);
+        short sx = (short)X;
+        short sy = (short)Y;
+
+        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
+            dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>4)+sy*(srcStep>>2)+sx] : (float)0;
+    }
+}
+
+__kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
+                            int dst_cols, int dst_rows, int srcStep, int dstStep,
+                            int src_offset, int dst_offset, __constant F * M, int threadCols )
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    if( dx < threadCols && dy < dst_rows)
+    {
+        src_offset = (src_offset>>4);
+        dst_offset = (dst_offset>>4);
+        srcStep = (srcStep>>2);
+        dstStep = (dstStep>>2);
+
+        F X0 = M[0]*dx + M[1]*dy + M[2];
+        F Y0 = M[3]*dx + M[4]*dy + M[5];
+        F W = M[6]*dx + M[7]*dy + M[8];
+        W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
+        int X = rint(X0*W);
+        int Y = rint(Y0*W);
+
+        short sx0 = (short)(X >> INTER_BITS);
+        short sy0 = (short)(Y >> INTER_BITS);
+        short ay0 = (short)(Y & (INTER_TAB_SIZE-1));
+        short ax0 = (short)(X & (INTER_TAB_SIZE-1));
+
+
+        float4 v0, v1, v2, v3;
+
+        v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : (float4)0;
+        v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : (float4)0;
+        v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : (float4)0;
+        v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? 
src[src_offset+(sy0+1) * srcStep + sx0+1] : (float4)0; + + float tab[4]; + float taby[2], tabx[2]; + taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay0; + taby[1] = 1.f/INTER_TAB_SIZE*ay0; + tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0; + tabx[1] = 1.f/INTER_TAB_SIZE*ax0; + + tab[0] = taby[0] * tabx[0]; + tab[1] = taby[0] * tabx[1]; + tab[2] = taby[1] * tabx[0]; + tab[3] = taby[1] * tabx[1]; + + float4 sum = 0; + sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3]; + if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) + dst[dst_offset+dy*dstStep+dx] = sum; + } +} + +__kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 * dst, + int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, + int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) +{ + int dx = get_global_id(0); + int dy = get_global_id(1); + + if( dx < threadCols && dy < dst_rows ) + { + src_offset = (src_offset>>4); + dst_offset = (dst_offset>>4); + srcStep = (srcStep>>2); + dstStep = (dstStep>>2); + + F X0 = M[0]*dx + M[1]*dy + M[2]; + F Y0 = M[3]*dx + M[4]*dy + M[5]; + F W = M[6]*dx + M[7]*dy + M[8]; + W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; + int X = rint(X0*W); + int Y = rint(Y0*W); + + short sx = (short)(X >> INTER_BITS)-1; + short sy = (short)(Y >> INTER_BITS)-1; + short ay = (short)(Y & (INTER_TAB_SIZE-1)); + short ax = (short)(X & (INTER_TAB_SIZE-1)); + + + float4 v[16]; + int i; + + for(i=0; i<16; i++) + v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? 
src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float4)0; + + float tab[16]; + float tab1y[4], tab1x[4]; + float axx, ayy; + + ayy = 1.f/INTER_TAB_SIZE * ay; + axx = 1.f/INTER_TAB_SIZE * ax; + interpolateCubic(ayy, tab1y); + interpolateCubic(axx, tab1x); + +#pragma unroll 4 + for( i=0; i<16; i++ ) + { + tab[i] = tab1y[(i>>2)] * tab1x[(i&3)]; + } + + if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) + { + float4 sum = 0; +#pragma unroll 4 + for ( i =0; i<16; i++ ) + { + sum += v[i] * tab[i]; + } + dst[dst_offset+dy*dstStep+dx] = sum; + + } + } +} diff --git a/modules/imgproc/src/precomp.hpp b/modules/imgproc/src/precomp.hpp index a3bbd65dbb..b806eda50e 100644 --- a/modules/imgproc/src/precomp.hpp +++ b/modules/imgproc/src/precomp.hpp @@ -48,6 +48,8 @@ #include "opencv2/imgproc/imgproc_c.h" #include "opencv2/core/private.hpp" +#include "opencv2/core/ocl.hpp" +#include "opencl_kernels.hpp" #include #include diff --git a/modules/imgproc/test/test_imgproc_umat.cpp b/modules/imgproc/test/test_imgproc_umat.cpp new file mode 100644 index 0000000000..ca72d76430 --- /dev/null +++ b/modules/imgproc/test/test_imgproc_umat.cpp @@ -0,0 +1,81 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. 
+// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include "test_precomp.hpp" +#include + +using namespace cv; +using namespace std; + +class CV_ImgprocUMatTest : public cvtest::BaseTest +{ +public: + CV_ImgprocUMatTest() {} + ~CV_ImgprocUMatTest() {} +protected: + void run(int) + { + string imgpath = string(ts->get_data_path()) + "shared/lena.png"; + Mat img = imread(imgpath, 1), gray, smallimg, result; + UMat uimg = img.getUMat(ACCESS_READ), ugray, usmallimg, uresult; + + cvtColor(img, gray, COLOR_BGR2GRAY); + resize(gray, smallimg, Size(), 0.75, 0.75, INTER_LINEAR); + equalizeHist(smallimg, result); + + cvtColor(uimg, ugray, COLOR_BGR2GRAY); + resize(ugray, usmallimg, Size(), 0.75, 0.75, INTER_LINEAR); + equalizeHist(usmallimg, uresult); + + imshow("orig", uimg); + imshow("small", usmallimg); + imshow("equalized gray", uresult); + waitKey(); + destroyWindow("orig"); + destroyWindow("small"); + destroyWindow("equalized gray"); + + ts->set_failed_test_info(cvtest::TS::OK); + } +}; + +TEST(Imgproc_UMat, regression) { CV_ImgprocUMatTest test; test.safe_run(); } diff --git a/modules/nonfree/src/precomp.hpp b/modules/nonfree/src/precomp.hpp index 1aeb1df434..204feaf717 100644 --- a/modules/nonfree/src/precomp.hpp +++ b/modules/nonfree/src/precomp.hpp @@ -52,6 +52,8 @@ #include "opencv2/nonfree/cuda.hpp" #include "opencv2/core/private.cuda.hpp" +#include "opencv2/core/ocl.hpp" + #include "opencv2/opencv_modules.hpp" #ifdef HAVE_OPENCV_CUDAARITHM diff --git a/modules/nonfree/src/surf.ocl.cpp b/modules/nonfree/src/surf.ocl.cpp index 20367ab98f..5f62668510 100644 --- a/modules/nonfree/src/surf.ocl.cpp +++ b/modules/nonfree/src/surf.ocl.cpp @@ -51,6 +51,8 @@ using namespace cv; using namespace cv::ocl; +static ProgramEntry surf = cv::ocl::nonfree::surf; + namespace cv { namespace ocl diff --git a/modules/objdetect/include/opencv2/objdetect.hpp b/modules/objdetect/include/opencv2/objdetect.hpp index f1b371610d..c3dee4a2bb 100644 --- a/modules/objdetect/include/opencv2/objdetect.hpp +++ 
b/modules/objdetect/include/opencv2/objdetect.hpp @@ -159,14 +159,14 @@ public: CV_WRAP virtual bool empty() const; CV_WRAP bool load( const String& filename ); virtual bool read( const FileNode& node ); - CV_WRAP virtual void detectMultiScale( const Mat& image, + CV_WRAP virtual void detectMultiScale( InputArray image, CV_OUT std::vector& objects, double scaleFactor = 1.1, int minNeighbors = 3, int flags = 0, Size minSize = Size(), Size maxSize = Size() ); - CV_WRAP virtual void detectMultiScale( const Mat& image, + CV_WRAP virtual void detectMultiScale( InputArray image, CV_OUT std::vector& objects, CV_OUT std::vector& numDetections, double scaleFactor=1.1, @@ -174,7 +174,7 @@ public: Size minSize=Size(), Size maxSize=Size() ); - CV_WRAP virtual void detectMultiScale( const Mat& image, + CV_WRAP virtual void detectMultiScale( InputArray image, CV_OUT std::vector& objects, CV_OUT std::vector& rejectLevels, CV_OUT std::vector& levelWeights, diff --git a/modules/objdetect/src/cascadedetect.cpp b/modules/objdetect/src/cascadedetect.cpp index 88f463faa0..92b685c5dd 100644 --- a/modules/objdetect/src/cascadedetect.cpp +++ b/modules/objdetect/src/cascadedetect.cpp @@ -1154,13 +1154,14 @@ void CascadeClassifier::detectMultiScaleNoGrouping( const Mat& image, std::vecto } } -void CascadeClassifier::detectMultiScale( const Mat& image, std::vector& objects, +void CascadeClassifier::detectMultiScale( InputArray _image, std::vector& objects, std::vector& rejectLevels, std::vector& levelWeights, double scaleFactor, int minNeighbors, int flags, Size minObjectSize, Size maxObjectSize, bool outputRejectLevels ) { + Mat image = _image.getMat(); CV_Assert( scaleFactor > 1 && image.depth() == CV_8U ); if( empty() ) @@ -1188,21 +1189,23 @@ void CascadeClassifier::detectMultiScale( const Mat& image, std::vector& o } } -void CascadeClassifier::detectMultiScale( const Mat& image, std::vector& objects, +void CascadeClassifier::detectMultiScale( InputArray _image, std::vector& objects, 
double scaleFactor, int minNeighbors, int flags, Size minObjectSize, Size maxObjectSize) { + Mat image = _image.getMat(); std::vector fakeLevels; std::vector fakeWeights; detectMultiScale( image, objects, fakeLevels, fakeWeights, scaleFactor, minNeighbors, flags, minObjectSize, maxObjectSize ); } -void CascadeClassifier::detectMultiScale( const Mat& image, std::vector& objects, +void CascadeClassifier::detectMultiScale( InputArray _image, std::vector& objects, std::vector& numDetections, double scaleFactor, int minNeighbors, int flags, Size minObjectSize, Size maxObjectSize ) { + Mat image = _image.getMat(); CV_Assert( scaleFactor > 1 && image.depth() == CV_8U ); if( empty() ) diff --git a/modules/objdetect/src/opencl/haarobjectdetect.cl b/modules/objdetect/src/opencl/haarobjectdetect.cl new file mode 100644 index 0000000000..5fa3533054 --- /dev/null +++ b/modules/objdetect/src/opencl/haarobjectdetect.cl @@ -0,0 +1,423 @@ +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Niko Li, newlife20080214@gmail.com +// Wang Weiyan, wangweiyanster@gmail.com +// Jia Haipeng, jiahaipeng95@gmail.com +// Nathan, liujun@multicorewareinc.com +// Peng Xiao, pengxiao@outlook.com +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +// + +#pragma OPENCL EXTENSION cl_amd_printf : enable +#define CV_HAAR_FEATURE_MAX 3 + +#define calc_sum(rect,offset) (sum[(rect).p0+offset] - sum[(rect).p1+offset] - sum[(rect).p2+offset] + sum[(rect).p3+offset]) +#define calc_sum1(rect,offset,i) (sum[(rect).p0[i]+offset] - sum[(rect).p1[i]+offset] - sum[(rect).p2[i]+offset] + sum[(rect).p3[i]+offset]) + +typedef int sumtype; +typedef float sqsumtype; + +#ifndef STUMP_BASED +#define STUMP_BASED 1 +#endif + +typedef struct __attribute__((aligned (128) )) GpuHidHaarTreeNode +{ + int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned (64))); + float weight[CV_HAAR_FEATURE_MAX]; + float threshold; + float alpha[3] __attribute__((aligned (16))); + int left __attribute__((aligned (4))); + int right __attribute__((aligned (4))); +} +GpuHidHaarTreeNode; + + +typedef struct __attribute__((aligned (32))) GpuHidHaarClassifier +{ + int count __attribute__((aligned (4))); + GpuHidHaarTreeNode* node __attribute__((aligned (8))); + float* alpha __attribute__((aligned (8))); +} 
+GpuHidHaarClassifier; + + +typedef struct __attribute__((aligned (64))) GpuHidHaarStageClassifier +{ + int count __attribute__((aligned (4))); + float threshold __attribute__((aligned (4))); + int two_rects __attribute__((aligned (4))); + int reserved0 __attribute__((aligned (8))); + int reserved1 __attribute__((aligned (8))); + int reserved2 __attribute__((aligned (8))); + int reserved3 __attribute__((aligned (8))); +} +GpuHidHaarStageClassifier; + + +typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade +{ + int count __attribute__((aligned (4))); + int is_stump_based __attribute__((aligned (4))); + int has_tilted_features __attribute__((aligned (4))); + int is_tree __attribute__((aligned (4))); + int pq0 __attribute__((aligned (4))); + int pq1 __attribute__((aligned (4))); + int pq2 __attribute__((aligned (4))); + int pq3 __attribute__((aligned (4))); + int p0 __attribute__((aligned (4))); + int p1 __attribute__((aligned (4))); + int p2 __attribute__((aligned (4))); + int p3 __attribute__((aligned (4))); + float inv_window_area __attribute__((aligned (4))); +} GpuHidHaarClassifierCascade; + +__kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade( + global GpuHidHaarStageClassifier * stagecascadeptr, + global int4 * info, + global GpuHidHaarTreeNode * nodeptr, + global const int * restrict sum1, + global const float * restrict sqsum1, + global int4 * candidate, + const int pixelstep, + const int loopcount, + const int start_stage, + const int split_stage, + const int end_stage, + const int startnode, + const int splitnode, + const int4 p, + const int4 pq, + const float correction) +{ + int grpszx = get_local_size(0); + int grpszy = get_local_size(1); + int grpnumx = get_num_groups(0); + int grpidx = get_group_id(0); + int lclidx = get_local_id(0); + int lclidy = get_local_id(1); + + int lcl_sz = mul24(grpszx,grpszy); + int lcl_id = mad24(lclidy,grpszx,lclidx); + + __local int lclshare[1024]; + __local int* 
lcldata = lclshare;//for save win data + __local int* glboutindex = lcldata + 28*28;//for save global out index + __local int* lclcount = glboutindex + 1;//for save the numuber of temp pass pixel + __local int* lcloutindex = lclcount + 1;//for save info of temp pass pixel + __local float* partialsum = (__local float*)(lcloutindex + (lcl_sz<<1)); + glboutindex[0]=0; + int outputoff = mul24(grpidx,256); + + //assume window size is 20X20 +#define WINDOWSIZE 20+1 + //make sure readwidth is the multiple of 4 + //ystep =1, from host code + int readwidth = ((grpszx-1 + WINDOWSIZE+3)>>2)<<2; + int readheight = grpszy-1+WINDOWSIZE; + int read_horiz_cnt = readwidth >> 2;//each read int4 + int total_read = mul24(read_horiz_cnt,readheight); + int read_loop = (total_read + lcl_sz - 1) >> 6; + candidate[outputoff+(lcl_id<<2)] = (int4)0; + candidate[outputoff+(lcl_id<<2)+1] = (int4)0; + candidate[outputoff+(lcl_id<<2)+2] = (int4)0; + candidate[outputoff+(lcl_id<<2)+3] = (int4)0; + for(int scalei = 0; scalei > 16; + int height = scaleinfo1.x & 0xffff; + int grpnumperline =(scaleinfo1.y & 0xffff0000) >> 16; + int totalgrp = scaleinfo1.y & 0xffff; + int imgoff = scaleinfo1.z; + float factor = as_float(scaleinfo1.w); + + __global const int * sum = sum1 + imgoff; + __global const float * sqsum = sqsum1 + imgoff; + for(int grploop=grpidx; grploop=0.f ? 
sqrt(variance_norm_factor) : 1.f; + + for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ ) + { + float stage_sum = 0.f; + int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop); + float stagethreshold = as_float(stageinfo.y); + for(int nodeloop = 0; nodeloop < stageinfo.x; ) + { + __global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter); + + int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0])); + int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0])); + int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0])); + float4 w = *(__global float4*)(&(currentnodeptr->weight[0])); + float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0])); + + float nodethreshold = w.w * variance_norm_factor; + + info1.x +=lcl_off; + info1.z +=lcl_off; + info2.x +=lcl_off; + info2.z +=lcl_off; + + float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] - + lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x; + + classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] - + lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y; + + info3.x +=lcl_off; + info3.z +=lcl_off; + classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] - + lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z; + + bool passThres = classsum >= nodethreshold; +#if STUMP_BASED + stage_sum += passThres ? alpha3.y : alpha3.x; + nodecounter++; + nodeloop++; +#else + bool isRootNode = (nodecounter & 1) == 0; + if(isRootNode) + { + if( (passThres && currentnodeptr->right) || + (!passThres && currentnodeptr->left)) + { + nodecounter ++; + } + else + { + stage_sum += alpha3.x; + nodecounter += 2; + nodeloop ++; + } + } + else + { + stage_sum += passThres ? 
alpha3.z : alpha3.y; + nodecounter ++; + nodeloop ++; + } +#endif + } + + result = (stage_sum >= stagethreshold); + } + + if(result && (x < width) && (y < height)) + { + int queueindex = atomic_inc(lclcount); + lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx; + lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor); + } + barrier(CLK_LOCAL_MEM_FENCE); + int queuecount = lclcount[0]; + barrier(CLK_LOCAL_MEM_FENCE); + nodecounter = splitnode; + for(int stageloop = split_stage; stageloop< end_stage && queuecount>0; stageloop++) + { + lclcount[0]=0; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop); + float stagethreshold = as_float(stageinfo.y); + + int perfscale = queuecount > 4 ? 3 : 2; + int queuecount_loop = (queuecount + (1<> perfscale; + int lcl_compute_win = lcl_sz >> perfscale; + int lcl_compute_win_id = (lcl_id >>(6-perfscale)); + int lcl_loops = (stageinfo.x + lcl_compute_win -1) >> (6-perfscale); + int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale)); + for(int queueloop=0; queueloop>16),readwidth,temp_coord & 0xffff); + + if(lcl_compute_win_id < queuecount) + { + int tempnodecounter = lcl_compute_id; + float part_sum = 0.f; + const int stump_factor = STUMP_BASED ? 
1 : 2; + int root_offset = 0; + for(int lcl_loop=0; lcl_loopp[0][0])); + int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0])); + int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0])); + float4 w = *(__global float4*)(&(currentnodeptr->weight[0])); + float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0])); + float nodethreshold = w.w * variance_norm_factor; + + info1.x +=queue_pixel; + info1.z +=queue_pixel; + info2.x +=queue_pixel; + info2.z +=queue_pixel; + + float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] - + lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x; + + + classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] - + lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y; + + info3.x +=queue_pixel; + info3.z +=queue_pixel; + classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] - + lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z; + + bool passThres = classsum >= nodethreshold; +#if STUMP_BASED + part_sum += passThres ? alpha3.y : alpha3.x; + tempnodecounter += lcl_compute_win; + lcl_loop++; +#else + if(root_offset == 0) + { + if( (passThres && currentnodeptr->right) || + (!passThres && currentnodeptr->left)) + { + root_offset = 1; + } + else + { + part_sum += alpha3.x; + tempnodecounter += lcl_compute_win; + lcl_loop++; + } + } + else + { + part_sum += passThres ? 
alpha3.z : alpha3.y; + tempnodecounter += lcl_compute_win; + lcl_loop++; + root_offset = 0; + } +#endif + }//end for(int lcl_loop=0;lcl_loop= stagethreshold && (lcl_compute_id==0)) + { + int queueindex = atomic_inc(lclcount); + lcloutindex[queueindex<<1] = temp_coord; + lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor); + } + lcl_compute_win_id +=(1<0;stageloop++) + + if(lcl_id> 16)); + temp = glboutindex[0]; + int4 candidate_result; + candidate_result.zw = (int2)convert_int_rtn(factor*20.f); + candidate_result.x = convert_int_rtn(x*factor); + candidate_result.y = convert_int_rtn(y*factor); + atomic_inc(glboutindex); + candidate[outputoff+temp+lcl_id] = candidate_result; + } + barrier(CLK_LOCAL_MEM_FENCE); + }//end for(int grploop=grpidx;grploop> 16; + int height = scaleinfo1.x & 0xffff; + int grpnumperline = (scaleinfo1.y & 0xffff0000) >> 16; + int totalgrp = scaleinfo1.y & 0xffff; + float factor = as_float(scaleinfo1.w); + float correction_t = correction[scalei]; + int ystep = (int)(max(2.0f, factor) + 0.5f); + + for (int grploop = get_group_id(0); grploop < totalgrp; grploop += grpnumx) + { + int4 cascadeinfo = p[scalei]; + int grpidy = grploop / grpnumperline; + int grpidx = grploop - mul24(grpidy, grpnumperline); + int ix = mad24(grpidx, grpszx, lclidx); + int iy = mad24(grpidy, grpszy, lclidy); + int x = ix * ystep; + int y = iy * ystep; + lcloutindex[lcl_id] = 0; + lclcount[0] = 0; + int nodecounter; + float mean, variance_norm_factor; + //if((ix < width) && (iy < height)) + { + const int p_offset = mad24(y, step, x); + cascadeinfo.x += p_offset; + cascadeinfo.z += p_offset; + mean = (sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)] + - sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] - + sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)] + + sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)]) + * correction_t; + variance_norm_factor = sqsum[clamp(mad24(cascadeinfo.y, step, 
cascadeinfo.x), 0, max_idx)] + - sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] - + sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)] + + sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)]; + variance_norm_factor = variance_norm_factor * correction_t - mean * mean; + variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f; + bool result = true; + nodecounter = startnode + nodecount * scalei; + for (int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++) + { + float stage_sum = 0.f; + int stagecount = stagecascadeptr[stageloop].count; + for (int nodeloop = 0; nodeloop < stagecount;) + { + __global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter); + int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0])); + int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0])); + int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0])); + float4 w = *(__global float4 *)(&(currentnodeptr->weight[0])); + float3 alpha3 = *(__global float3 *)(&(currentnodeptr->alpha[0])); + float nodethreshold = w.w * variance_norm_factor; + + info1.x += p_offset; + info1.z += p_offset; + info2.x += p_offset; + info2.z += p_offset; + info3.x += p_offset; + info3.z += p_offset; + float classsum = (sum[clamp(mad24(info1.y, step, info1.x), 0, max_idx)] + - sum[clamp(mad24(info1.y, step, info1.z), 0, max_idx)] - + sum[clamp(mad24(info1.w, step, info1.x), 0, max_idx)] + + sum[clamp(mad24(info1.w, step, info1.z), 0, max_idx)]) * w.x; + classsum += (sum[clamp(mad24(info2.y, step, info2.x), 0, max_idx)] + - sum[clamp(mad24(info2.y, step, info2.z), 0, max_idx)] - + sum[clamp(mad24(info2.w, step, info2.x), 0, max_idx)] + + sum[clamp(mad24(info2.w, step, info2.z), 0, max_idx)]) * w.y; + classsum += (sum[clamp(mad24(info3.y, step, info3.x), 0, max_idx)] + - sum[clamp(mad24(info3.y, step, info3.z), 0, max_idx)] - + sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)] + + 
sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z; + + bool passThres = classsum >= nodethreshold; + +#if STUMP_BASED + stage_sum += passThres ? alpha3.y : alpha3.x; + nodecounter++; + nodeloop++; +#else + bool isRootNode = (nodecounter & 1) == 0; + if(isRootNode) + { + if( (passThres && currentnodeptr->right) || + (!passThres && currentnodeptr->left)) + { + nodecounter ++; + } + else + { + stage_sum += alpha3.x; + nodecounter += 2; + nodeloop ++; + } + } + else + { + stage_sum += (passThres ? alpha3.z : alpha3.y); + nodecounter ++; + nodeloop ++; + } +#endif + } + result = (int)(stage_sum >= stagecascadeptr[stageloop].threshold); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (result && (ix < width) && (iy < height)) + { + int queueindex = atomic_inc(lclcount); + lcloutindex[queueindex] = (y << 16) | x; + } + barrier(CLK_LOCAL_MEM_FENCE); + int queuecount = lclcount[0]; + + if (lcl_id < queuecount) + { + int temp = lcloutindex[lcl_id]; + int x = temp & 0xffff; + int y = (temp & (int)0xffff0000) >> 16; + temp = atomic_inc(glboutindex); + int4 candidate_result; + candidate_result.zw = (int2)convert_int_rtn(factor * 20.f); + candidate_result.x = x; + candidate_result.y = y; + candidate[outputoff + temp + lcl_id] = candidate_result; + } + + barrier(CLK_LOCAL_MEM_FENCE); + } + } + } +} +__kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuHidHaarTreeNode *newnode, float scale, float weight_scale, int nodenum) +{ + int counter = get_global_id(0); + int tr_x[3], tr_y[3], tr_h[3], tr_w[3], i = 0; + GpuHidHaarTreeNode t1 = *(orinode + counter); +#pragma unroll + + for (i = 0; i < 3; i++) + { + tr_x[i] = (int)(t1.p[i][0] * scale + 0.5f); + tr_y[i] = (int)(t1.p[i][1] * scale + 0.5f); + tr_w[i] = (int)(t1.p[i][2] * scale + 0.5f); + tr_h[i] = (int)(t1.p[i][3] * scale + 0.5f); + } + + t1.weight[0] = t1.p[2][0] ? 
-(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]) : -t1.weight[1] * tr_h[1] * tr_w[1] / (tr_h[0] * tr_w[0]); + counter += nodenum; +#pragma unroll + + for (i = 0; i < 3; i++) + { + newnode[counter].p[i][0] = tr_x[i]; + newnode[counter].p[i][1] = tr_y[i]; + newnode[counter].p[i][2] = tr_x[i] + tr_w[i]; + newnode[counter].p[i][3] = tr_y[i] + tr_h[i]; + newnode[counter].weight[i] = t1.weight[i] * weight_scale; + } + + newnode[counter].left = t1.left; + newnode[counter].right = t1.right; + newnode[counter].threshold = t1.threshold; + newnode[counter].alpha[0] = t1.alpha[0]; + newnode[counter].alpha[1] = t1.alpha[1]; + newnode[counter].alpha[2] = t1.alpha[2]; +} diff --git a/modules/objdetect/src/precomp.hpp b/modules/objdetect/src/precomp.hpp index e5157d022f..97b976baf2 100644 --- a/modules/objdetect/src/precomp.hpp +++ b/modules/objdetect/src/precomp.hpp @@ -49,6 +49,7 @@ #include "opencv2/ml.hpp" #include "opencv2/core/utility.hpp" +#include "opencv2/core/ocl.hpp" #include "opencv2/opencv_modules.hpp" #ifdef HAVE_OPENCV_HIGHGUI diff --git a/modules/ocl/include/opencv2/ocl/private/util.hpp b/modules/ocl/include/opencv2/ocl/private/util.hpp index efb684cc2a..98b734a539 100644 --- a/modules/ocl/include/opencv2/ocl/private/util.hpp +++ b/modules/ocl/include/opencv2/ocl/private/util.hpp @@ -47,6 +47,7 @@ #define __OPENCV_OCL_PRIVATE_UTIL__ #include "opencv2/ocl/cl_runtime/cl_runtime.hpp" +#include "opencv2/core/ocl_genbase.hpp" #include "opencv2/ocl.hpp" @@ -55,13 +56,6 @@ namespace cv namespace ocl { -struct ProgramEntry -{ - const char* name; - const char* programStr; - const char* programHash; -}; - inline cl_device_id getClDeviceID(const Context *ctx) { return *(cl_device_id*)(ctx->getOpenCLDeviceIDPtr()); diff --git a/modules/superres/src/btv_l1_ocl.cpp b/modules/superres/src/btv_l1_ocl.cpp index 44edc815ec..7fd6741e8d 100644 --- a/modules/superres/src/btv_l1_ocl.cpp +++ b/modules/superres/src/btv_l1_ocl.cpp @@ -64,6 +64,8 
@@ using namespace cv::ocl; using namespace cv::superres; using namespace cv::superres::detail; +static ProgramEntry superres_btvl1 = cv::ocl::superres::superres_btvl1; + namespace cv { namespace ocl diff --git a/modules/superres/src/precomp.hpp b/modules/superres/src/precomp.hpp index c5dbe2db29..0681bfa28c 100644 --- a/modules/superres/src/precomp.hpp +++ b/modules/superres/src/precomp.hpp @@ -56,6 +56,7 @@ #include "opencv2/core/private.hpp" #include "opencv2/core/private.cuda.hpp" +#include "opencv2/core/ocl.hpp" #ifdef HAVE_OPENCV_CUDAARITHM # include "opencv2/cudaarithm.hpp" diff --git a/samples/cpp/ufacedetect.cpp b/samples/cpp/ufacedetect.cpp new file mode 100644 index 0000000000..01ba510b9e --- /dev/null +++ b/samples/cpp/ufacedetect.cpp @@ -0,0 +1,276 @@ +#include "opencv2/objdetect.hpp" +#include "opencv2/highgui.hpp" +#include "opencv2/imgproc.hpp" +#include "opencv2/core/utility.hpp" +#include "opencv2/core/ocl.hpp" + +#include +#include +#include +#include + +using namespace std; +using namespace cv; + +static void help() +{ + cout << "\nThis program demonstrates the cascade recognizer. 
Now you can use Haar or LBP features.\n" + "This classifier can recognize many kinds of rigid objects, once the appropriate classifier is trained.\n" + "It's most known use is for faces.\n" + "Usage:\n" + "./facedetect [--cascade= this is the primary trained classifier such as frontal face]\n" + " [--nested-cascade[=nested_cascade_path this an optional secondary classifier such as eyes]]\n" + " [--scale=]\n" + " [--try-flip]\n" + " [filename|camera_index]\n\n" + "see facedetect.cmd for one call:\n" + "./facedetect --cascade=\"../../data/haarcascades/haarcascade_frontalface_alt.xml\" --nested-cascade=\"../../data/haarcascades/haarcascade_eye.xml\" --scale=1.3\n\n" + "During execution:\n\tHit any key to quit.\n" + "\tUsing OpenCV version " << CV_VERSION << "\n" << endl; +} + +void detectAndDraw( UMat& img, Mat& canvas, CascadeClassifier& cascade, + CascadeClassifier& nestedCascade, + double scale, bool tryflip ); + +string cascadeName = "../../data/haarcascades/haarcascade_frontalface_alt.xml"; +string nestedCascadeName = "../../data/haarcascades/haarcascade_eye_tree_eyeglasses.xml"; + +int main( int argc, const char** argv ) +{ + VideoCapture capture; + UMat frame, image; + Mat canvas; + const string scaleOpt = "--scale="; + size_t scaleOptLen = scaleOpt.length(); + const string cascadeOpt = "--cascade="; + size_t cascadeOptLen = cascadeOpt.length(); + const string nestedCascadeOpt = "--nested-cascade"; + size_t nestedCascadeOptLen = nestedCascadeOpt.length(); + const string tryFlipOpt = "--try-flip"; + size_t tryFlipOptLen = tryFlipOpt.length(); + String inputName; + bool tryflip = false; + + help(); + + CascadeClassifier cascade, nestedCascade; + double scale = 1; + + for( int i = 1; i < argc; i++ ) + { + cout << "Processing " << i << " " << argv[i] << endl; + if( cascadeOpt.compare( 0, cascadeOptLen, argv[i], cascadeOptLen ) == 0 ) + { + cascadeName.assign( argv[i] + cascadeOptLen ); + cout << " from which we have cascadeName= " << cascadeName << endl; + } + else 
if( nestedCascadeOpt.compare( 0, nestedCascadeOptLen, argv[i], nestedCascadeOptLen ) == 0 ) + { + if( argv[i][nestedCascadeOpt.length()] == '=' ) + nestedCascadeName.assign( argv[i] + nestedCascadeOpt.length() + 1 ); + if( !nestedCascade.load( nestedCascadeName ) ) + cerr << "WARNING: Could not load classifier cascade for nested objects" << endl; + } + else if( scaleOpt.compare( 0, scaleOptLen, argv[i], scaleOptLen ) == 0 ) + { + if( !sscanf( argv[i] + scaleOpt.length(), "%lf", &scale ) || scale > 1 ) + scale = 1; + cout << " from which we read scale = " << scale << endl; + } + else if( tryFlipOpt.compare( 0, tryFlipOptLen, argv[i], tryFlipOptLen ) == 0 ) + { + tryflip = true; + cout << " will try to flip image horizontally to detect assymetric objects\n"; + } + else if( argv[i][0] == '-' ) + { + cerr << "WARNING: Unknown option %s" << argv[i] << endl; + } + else + inputName = argv[i]; + } + + if( !cascade.load( cascadeName ) ) + { + cerr << "ERROR: Could not load classifier cascade" << endl; + help(); + return -1; + } + + if( inputName.empty() || (isdigit(inputName.c_str()[0]) && inputName.c_str()[1] == '\0') ) + { + int c = inputName.empty() ? 0 : inputName.c_str()[0] - '0'; + if(!capture.open(c)) + cout << "Capture from camera #" << c << " didn't work" << endl; + } + else + { + if( inputName.empty() ) + inputName = "lena.jpg"; + image = imread( inputName, 1 ).getUMat(ACCESS_READ); + if( image.empty() ) + { + if(!capture.open( inputName )) + cout << "Could not read " << inputName << endl; + } + } + + namedWindow( "result", 1 ); + + if( capture.isOpened() ) + { + cout << "Video capturing has been started ..." 
<< endl; + for(;;) + { + capture >> frame; + if( frame.empty() ) + break; + + detectAndDraw( frame, canvas, cascade, nestedCascade, scale, tryflip ); + + if( waitKey( 10 ) >= 0 ) + break; + } + } + else + { + cout << "Detecting face(s) in " << inputName << endl; + if( !image.empty() ) + { + detectAndDraw( image, canvas, cascade, nestedCascade, scale, tryflip ); + waitKey(0); + } + else if( !inputName.empty() ) + { + /* assume it is a text file containing the + list of the image filenames to be processed - one per line */ + FILE* f = fopen( inputName.c_str(), "rt" ); + if( f ) + { + char buf[1000+1]; + while( fgets( buf, 1000, f ) ) + { + int len = (int)strlen(buf), c; + while( len > 0 && isspace(buf[len-1]) ) + len--; + buf[len] = '\0'; + cout << "file " << buf << endl; + image = imread( buf, 1 ).getUMat(ACCESS_READ); + if( !image.empty() ) + { + detectAndDraw( image, canvas, cascade, nestedCascade, scale, tryflip ); + c = waitKey(0); + if( c == 27 || c == 'q' || c == 'Q' ) + break; + } + else + { + cerr << "Aw snap, couldn't read image " << buf << endl; + } + } + fclose(f); + } + } + } + + return 0; +} + +void detectAndDraw( UMat& img, Mat& canvas, CascadeClassifier& cascade, + CascadeClassifier& nestedCascade, + double scale0, bool tryflip ) +{ + int i = 0; + double t = 0, scale=1; + vector faces, faces2; + const static Scalar colors[] = + { + Scalar(0,0,255), + Scalar(0,128,255), + Scalar(0,255,255), + Scalar(0,255,0), + Scalar(255,128,0), + Scalar(255,255,0), + Scalar(255,0,0), + Scalar(255,0,255) + }; + static UMat gray, smallImg; + + t = (double)getTickCount(); + + cvtColor( img, gray, COLOR_BGR2GRAY ); + resize( gray, smallImg, Size(), scale0, scale0, INTER_LINEAR ); + cvtColor(smallImg, canvas, COLOR_GRAY2BGR); + equalizeHist( smallImg, smallImg ); + + cascade.detectMultiScale( smallImg, faces, + 1.1, 2, 0 + //|CASCADE_FIND_BIGGEST_OBJECT + //|CASCADE_DO_ROUGH_SEARCH + |CASCADE_SCALE_IMAGE + , + Size(30, 30) ); + if( tryflip ) + { + flip(smallImg, smallImg, 
1); + cascade.detectMultiScale( smallImg, faces2, + 1.1, 2, 0 + //|CASCADE_FIND_BIGGEST_OBJECT + //|CASCADE_DO_ROUGH_SEARCH + |CASCADE_SCALE_IMAGE + , + Size(30, 30) ); + for( vector::const_iterator r = faces2.begin(); r != faces2.end(); r++ ) + { + faces.push_back(Rect(smallImg.cols - r->x - r->width, r->y, r->width, r->height)); + } + } + t = (double)getTickCount() - t; + cvtColor(smallImg, canvas, COLOR_GRAY2BGR); + + double fps = getTickFrequency()/t; + + putText(canvas, format("OpenCL: %s, fps: %.1f", ocl::useOpenCL() ? "ON" : "OFF", fps), Point(250, 50), + FONT_HERSHEY_SIMPLEX, 1, Scalar(0,255,0), 3); + + for( vector::const_iterator r = faces.begin(); r != faces.end(); r++, i++ ) + { + vector nestedObjects; + Point center; + Scalar color = colors[i%8]; + int radius; + + double aspect_ratio = (double)r->width/r->height; + if( 0.75 < aspect_ratio && aspect_ratio < 1.3 ) + { + center.x = cvRound((r->x + r->width*0.5)*scale); + center.y = cvRound((r->y + r->height*0.5)*scale); + radius = cvRound((r->width + r->height)*0.25*scale); + circle( canvas, center, radius, color, 3, 8, 0 ); + } + else + rectangle( canvas, Point(cvRound(r->x*scale), cvRound(r->y*scale)), + Point(cvRound((r->x + r->width-1)*scale), cvRound((r->y + r->height-1)*scale)), + color, 3, 8, 0); + if( nestedCascade.empty() ) + continue; + UMat smallImgROI = smallImg(*r); + nestedCascade.detectMultiScale( smallImgROI, nestedObjects, + 1.1, 2, 0 + //|CASCADE_FIND_BIGGEST_OBJECT + //|CASCADE_DO_ROUGH_SEARCH + //|CASCADE_DO_CANNY_PRUNING + |CASCADE_SCALE_IMAGE + , + Size(30, 30) ); + for( vector::const_iterator nr = nestedObjects.begin(); nr != nestedObjects.end(); nr++ ) + { + center.x = cvRound((r->x + nr->x + nr->width*0.5)*scale); + center.y = cvRound((r->y + nr->y + nr->height*0.5)*scale); + radius = cvRound((nr->width + nr->height)*0.25*scale); + circle( canvas, center, radius, color, 3, 8, 0 ); + } + } + imshow( "result", canvas ); +} diff --git a/samples/ocl/facedetect.cpp 
b/samples/ocl/facedetect.cpp index 8669719504..37861154ec 100644 --- a/samples/ocl/facedetect.cpp +++ b/samples/ocl/facedetect.cpp @@ -11,7 +11,7 @@ using namespace std; using namespace cv; -#define LOOP_NUM 10 +#define LOOP_NUM 1 const static Scalar colors[] = { CV_RGB(0,0,255), CV_RGB(0,128,255), @@ -83,7 +83,7 @@ int main( int argc, const char** argv ) } CvCapture* capture = 0; - Mat frame, frameCopy, image; + Mat frame, frameCopy0, frameCopy, image; bool useCPU = cmd.get("s"); string inputName = cmd.get("i"); @@ -129,16 +129,21 @@ int main( int argc, const char** argv ) if( frame.empty() ) break; if( iplImg->origin == IPL_ORIGIN_TL ) - frame.copyTo( frameCopy ); + frame.copyTo( frameCopy0 ); else - flip( frame, frameCopy, 0 ); + flip( frame, frameCopy0, 0 ); + if( scale == 1) + frameCopy0.copyTo(frameCopy); + else + resize(frameCopy0, frameCopy, Size(), 1./scale, 1./scale, INTER_LINEAR); + work_end = 0; if(useCPU) - detectCPU(frameCopy, faces, cpu_cascade, scale, false); + detectCPU(frameCopy, faces, cpu_cascade, 1, false); else - detect(frameCopy, faces, cascade, scale, false); + detect(frameCopy, faces, cascade, 1, false); - Draw(frameCopy, faces, scale); + Draw(frameCopy, faces, 1); if( waitKey( 10 ) >= 0 ) break; } @@ -150,6 +155,7 @@ int main( int argc, const char** argv ) vector faces; vector ref_rst; double accuracy = 0.; + work_end = 0; for(int i = 0; i <= LOOP_NUM; i ++) { cout << "loop" << i << endl; @@ -188,7 +194,7 @@ void detect( Mat& img, vector& faces, { ocl::oclMat image(img); ocl::oclMat gray, smallImg( cvRound (img.rows/scale), cvRound(img.cols/scale), CV_8UC1 ); - if(calTime) workBegin(); + workBegin(); ocl::cvtColor( image, gray, COLOR_BGR2GRAY ); ocl::resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR ); ocl::equalizeHist( smallImg, smallImg ); @@ -197,14 +203,14 @@ void detect( Mat& img, vector& faces, 3, 0 |CASCADE_SCALE_IMAGE , Size(30,30), Size(0, 0) ); - if(calTime) workEnd(); + workEnd(); } void detectCPU( Mat& img, vector& 
faces, CascadeClassifier& cascade, double scale, bool calTime) { - if(calTime) workBegin(); + workBegin(); Mat cpu_gray, cpu_smallImg( cvRound (img.rows/scale), cvRound(img.cols/scale), CV_8UC1 ); cvtColor(img, cpu_gray, COLOR_BGR2GRAY); resize(cpu_gray, cpu_smallImg, cpu_smallImg.size(), 0, 0, INTER_LINEAR); @@ -212,13 +218,15 @@ void detectCPU( Mat& img, vector& faces, cascade.detectMultiScale(cpu_smallImg, faces, 1.1, 3, 0 | CASCADE_SCALE_IMAGE, Size(30, 30), Size(0, 0)); - if(calTime) workEnd(); + workEnd(); } void Draw(Mat& img, vector& faces, double scale) { int i = 0; + putText(img, format("fps: %.1f", 1000./getTime()), Point(450, 50), + FONT_HERSHEY_SIMPLEX, 1, Scalar(0,255,0), 3); for( vector::const_iterator r = faces.begin(); r != faces.end(); r++, i++ ) { Point center; @@ -229,7 +237,7 @@ void Draw(Mat& img, vector& faces, double scale) radius = cvRound((r->width + r->height)*0.25*scale); circle( img, center, radius, color, 3, 8, 0 ); } - imwrite( outputName, img ); + //imwrite( outputName, img ); if(abs(scale-1.0)>.001) { resize(img, img, Size((int)(img.cols/scale), (int)(img.rows/scale)));