From 7ee1d5f69bbf49e766dbb9cc2f78c1f48b60a401 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Tue, 22 Apr 2014 19:52:37 +0400 Subject: [PATCH] ipp: added LUT optimization --- modules/core/include/opencv2/core/private.hpp | 2 + modules/core/src/convert.cpp | 237 +++++++++++++++++- modules/imgproc/src/morph.cpp | 5 +- 3 files changed, 241 insertions(+), 3 deletions(-) diff --git a/modules/core/include/opencv2/core/private.hpp b/modules/core/include/opencv2/core/private.hpp index a9210a18ba..eea2281dc2 100644 --- a/modules/core/include/opencv2/core/private.hpp +++ b/modules/core/include/opencv2/core/private.hpp @@ -218,6 +218,8 @@ CV_EXPORTS void scalarToRawData(const cv::Scalar& s, void* buf, int type, int un # endif # define IPP_VERSION_X100 (IPP_VERSION_MAJOR * 100 + IPP_VERSION_MINOR) +#define IPP_ALIGN 32 // required for AVX optimization + #define setIppErrorStatus() cv::ipp::setIppStatus(-1, CV_Func, __FILE__, __LINE__) static inline IppiSize ippiSize(int width, int height) diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index e6154430bb..6684dedd70 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -1543,10 +1543,10 @@ static LUTFunc lutTab[] = static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst) { - int dtype = _dst.type(), lcn = _lut.channels(), dcn = CV_MAT_CN(dtype), ddepth = CV_MAT_DEPTH(dtype); + int lcn = _lut.channels(), dcn = _src.channels(), ddepth = _lut.depth(); UMat src = _src.getUMat(), lut = _lut.getUMat(); - _dst.create(src.size(), dtype); + _dst.create(src.size(), CV_MAKETYPE(ddepth, dcn)); UMat dst = _dst.getUMat(); ocl::Kernel k("LUT", ocl::core::lut_oclsrc, @@ -1564,6 +1564,201 @@ static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst) #endif +#if defined(HAVE_IPP) && !defined(HAVE_IPP_ICV_ONLY) +namespace ipp { + +#if 0 // there are no performance benefits (PR #2653) +class IppLUTParallelBody_LUTC1 : public ParallelLoopBody +{ +public: + bool* 
ok; + const Mat& src_; + const Mat& lut_; + Mat& dst_; + + typedef IppStatus (*IppFn)(const Ipp8u* pSrc, int srcStep, void* pDst, int dstStep, + IppiSize roiSize, const void* pTable, int nBitSize); + IppFn fn; + + int width; + + IppLUTParallelBody_LUTC1(const Mat& src, const Mat& lut, Mat& dst, bool* _ok) + : ok(_ok), src_(src), lut_(lut), dst_(dst) + { + width = dst.cols * dst.channels(); + + size_t elemSize1 = CV_ELEM_SIZE1(dst.depth()); + + fn = + elemSize1 == 1 ? (IppFn)ippiLUTPalette_8u_C1R : + elemSize1 == 4 ? (IppFn)ippiLUTPalette_8u32u_C1R : + NULL; + + *ok = (fn != NULL); + } + + void operator()( const cv::Range& range ) const + { + if (!*ok) + return; + + const int row0 = range.start; + const int row1 = range.end; + + Mat src = src_.rowRange(row0, row1); + Mat dst = dst_.rowRange(row0, row1); + + IppiSize sz = { width, dst.rows }; + + CV_DbgAssert(fn != NULL); + if (fn(src.data, (int)src.step[0], dst.data, (int)dst.step[0], sz, lut_.data, 8) < 0) + { + setIppErrorStatus(); + *ok = false; + } + } +private: + IppLUTParallelBody_LUTC1(const IppLUTParallelBody_LUTC1&); + IppLUTParallelBody_LUTC1& operator=(const IppLUTParallelBody_LUTC1&); +}; +#endif + +class IppLUTParallelBody_LUTCN : public ParallelLoopBody +{ +public: + bool *ok; + const Mat& src_; + const Mat& lut_; + Mat& dst_; + + int lutcn; + + uchar* lutBuffer; + uchar* lutTable[4]; + + IppLUTParallelBody_LUTCN(const Mat& src, const Mat& lut, Mat& dst, bool* _ok) + : ok(_ok), src_(src), lut_(lut), dst_(dst), lutBuffer(NULL) + { + lutcn = lut.channels(); + IppiSize sz256 = {256, 1}; + + size_t elemSize1 = dst.elemSize1(); + CV_DbgAssert(elemSize1 == 1); + lutBuffer = (uchar*)ippMalloc(256 * (int)elemSize1 * 4); + lutTable[0] = lutBuffer + 0; + lutTable[1] = lutBuffer + 1 * 256 * elemSize1; + lutTable[2] = lutBuffer + 2 * 256 * elemSize1; + lutTable[3] = lutBuffer + 3 * 256 * elemSize1; + + CV_DbgAssert(lutcn == 3 || lutcn == 4); + if (lutcn == 3) + { + IppStatus status = ippiCopy_8u_C3P3R(lut.data, 
(int)lut.step[0], lutTable, (int)lut.step[0], sz256); + if (status < 0) + { + setIppErrorStatus(); + return; + } + } + else if (lutcn == 4) + { + IppStatus status = ippiCopy_8u_C4P4R(lut.data, (int)lut.step[0], lutTable, (int)lut.step[0], sz256); + if (status < 0) + { + setIppErrorStatus(); + return; + } + } + + *ok = true; + } + + ~IppLUTParallelBody_LUTCN() + { + if (lutBuffer != NULL) + ippFree(lutBuffer); + lutBuffer = NULL; + lutTable[0] = NULL; + } + + void operator()( const cv::Range& range ) const + { + if (!*ok) + return; + + const int row0 = range.start; + const int row1 = range.end; + + Mat src = src_.rowRange(row0, row1); + Mat dst = dst_.rowRange(row0, row1); + + if (lutcn == 3) + { + if (ippiLUTPalette_8u_C3R( + src.data, (int)src.step[0], dst.data, (int)dst.step[0], + ippiSize(dst.size()), lutTable, 8) >= 0) + return; + } + else if (lutcn == 4) + { + if (ippiLUTPalette_8u_C4R( + src.data, (int)src.step[0], dst.data, (int)dst.step[0], + ippiSize(dst.size()), lutTable, 8) >= 0) + return; + } + setIppErrorStatus(); + *ok = false; + } +private: + IppLUTParallelBody_LUTCN(const IppLUTParallelBody_LUTCN&); + IppLUTParallelBody_LUTCN& operator=(const IppLUTParallelBody_LUTCN&); +}; +} // namespace ipp +#endif // IPP + +class LUTParallelBody : public ParallelLoopBody +{ +public: + bool* ok; + const Mat& src_; + const Mat& lut_; + Mat& dst_; + + LUTFunc func; + + LUTParallelBody(const Mat& src, const Mat& lut, Mat& dst, bool* _ok) + : ok(_ok), src_(src), lut_(lut), dst_(dst) + { + func = lutTab[lut.depth()]; + *ok = (func != NULL); + } + + void operator()( const cv::Range& range ) const + { + CV_DbgAssert(*ok); + + const int row0 = range.start; + const int row1 = range.end; + + Mat src = src_.rowRange(row0, row1); + Mat dst = dst_.rowRange(row0, row1); + + int cn = src.channels(); + int lutcn = lut_.channels(); + + const Mat* arrays[] = {&src, &dst, 0}; + uchar* ptrs[2]; + NAryMatIterator it(arrays, ptrs); + int len = (int)it.size; + + for( size_t i = 0; i < 
it.nplanes; i++, ++it ) + func(ptrs[0], lut_.data, ptrs[1], len, cn, lutcn); + } +private: + LUTParallelBody(const LUTParallelBody&); + LUTParallelBody& operator=(const LUTParallelBody&); +}; + } void cv::LUT( InputArray _src, InputArray _lut, OutputArray _dst ) @@ -1582,6 +1777,44 @@ void cv::LUT( InputArray _src, InputArray _lut, OutputArray _dst ) _dst.create(src.dims, src.size, CV_MAKETYPE(_lut.depth(), cn)); Mat dst = _dst.getMat(); + if (_src.dims() <= 2) + { + bool ok = false; + Ptr<ParallelLoopBody> body; +#if defined(HAVE_IPP) && !defined(HAVE_IPP_ICV_ONLY) + size_t elemSize1 = CV_ELEM_SIZE1(dst.depth()); +#if 0 // there are no performance benefits (PR #2653) + if (lutcn == 1) + { + ParallelLoopBody* p = new ipp::IppLUTParallelBody_LUTC1(src, lut, dst, &ok); + body.reset(p); + } + else +#endif + if ((lutcn == 3 || lutcn == 4) && elemSize1 == 1) + { + ParallelLoopBody* p = new ipp::IppLUTParallelBody_LUTCN(src, lut, dst, &ok); + body.reset(p); + } +#endif + if (body == NULL || ok == false) + { + ok = false; + ParallelLoopBody* p = new LUTParallelBody(src, lut, dst, &ok); + body.reset(p); + } + if (body != NULL && ok) + { + Range all(0, dst.rows); + if (dst.total()>>18) + parallel_for_(all, *body, (double)std::max((size_t)1, dst.total()>>16)); + else + (*body)(all); + if (ok) + return; + } + } + LUTFunc func = lutTab[lut.depth()]; CV_Assert( func != 0 ); diff --git a/modules/imgproc/src/morph.cpp b/modules/imgproc/src/morph.cpp index e76f8ab4ed..4747c60f51 100644 --- a/modules/imgproc/src/morph.cpp +++ b/modules/imgproc/src/morph.cpp @@ -1258,8 +1258,11 @@ static bool IPPMorphReplicate(int op, const Mat &src, Mat &dst, const Mat &kerne default: return false; } - #undef IPP_MORPH_CASE + +#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ == 8 + return false; /// It disables false positive warning in GCC 4.8.2 +#endif #endif } }