ipp: added LUT optimization

pull/2653/head
Alexander Alekhin 11 years ago
parent 8114e071c2
commit 7ee1d5f69b
  1. 2
      modules/core/include/opencv2/core/private.hpp
  2. 237
      modules/core/src/convert.cpp
  3. 5
      modules/imgproc/src/morph.cpp

@ -218,6 +218,8 @@ CV_EXPORTS void scalarToRawData(const cv::Scalar& s, void* buf, int type, int un
# endif
# define IPP_VERSION_X100 (IPP_VERSION_MAJOR * 100 + IPP_VERSION_MINOR)
#define IPP_ALIGN 32 // required for AVX optimization
#define setIppErrorStatus() cv::ipp::setIppStatus(-1, CV_Func, __FILE__, __LINE__)
static inline IppiSize ippiSize(int width, int height)

@ -1543,10 +1543,10 @@ static LUTFunc lutTab[] =
static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst)
{
int dtype = _dst.type(), lcn = _lut.channels(), dcn = CV_MAT_CN(dtype), ddepth = CV_MAT_DEPTH(dtype);
int lcn = _lut.channels(), dcn = _src.channels(), ddepth = _lut.depth();
UMat src = _src.getUMat(), lut = _lut.getUMat();
_dst.create(src.size(), dtype);
_dst.create(src.size(), CV_MAKETYPE(ddepth, dcn));
UMat dst = _dst.getUMat();
ocl::Kernel k("LUT", ocl::core::lut_oclsrc,
@ -1564,6 +1564,201 @@ static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst)
#endif
#if defined(HAVE_IPP) && !defined(HAVE_IPP_ICV_ONLY)
namespace ipp {
#if 0 // there are no performance benefits (PR #2653)
// Parallel loop body for a single-channel LUT applied through the IPP
// ippiLUTPalette_8u_C1R / ippiLUTPalette_8u32u_C1R primitives.
// The constructor reports through *ok whether a primitive exists for the
// destination element size; operator() clears *ok if the IPP call fails.
class IppLUTParallelBody_LUTC1 : public ParallelLoopBody
{
public:
    bool* ok;
    const Mat& src_;
    const Mat& lut_;
    Mat& dst_;

    // Common signature both palette primitives are cast to.
    typedef IppStatus (*IppFn)(const Ipp8u* pSrc, int srcStep, void* pDst, int dstStep,
                               IppiSize roiSize, const void* pTable, int nBitSize);
    IppFn fn;

    int width;

    IppLUTParallelBody_LUTC1(const Mat& src, const Mat& lut, Mat& dst, bool* _ok)
        : ok(_ok), src_(src), lut_(lut), dst_(dst)
    {
        // Row width counted in elements: columns times channels.
        width = dst.cols * dst.channels();

        // Select the primitive by the byte size of one destination element.
        switch (CV_ELEM_SIZE1(dst.depth()))
        {
        case 1:
            fn = (IppFn)ippiLUTPalette_8u_C1R;
            break;
        case 4:
            fn = (IppFn)ippiLUTPalette_8u32u_C1R;
            break;
        default:
            fn = NULL;
            break;
        }

        *ok = (fn != NULL);
    }

    void operator()( const cv::Range& range ) const
    {
        // Nothing to do once the body has been marked as failed/unsupported.
        if (!*ok)
            return;

        // Restrict source and destination to the rows of this stripe.
        Mat srcStripe = src_.rowRange(range.start, range.end);
        Mat dstStripe = dst_.rowRange(range.start, range.end);

        IppiSize roi = { width, dstStripe.rows };

        CV_DbgAssert(fn != NULL);
        const IppStatus status = fn(srcStripe.data, (int)srcStripe.step[0],
                                    dstStripe.data, (int)dstStripe.step[0],
                                    roi, lut_.data, 8);
        if (status < 0)
        {
            setIppErrorStatus();
            *ok = false;
        }
    }

private:
    // Copying is disabled: declared but intentionally left undefined.
    IppLUTParallelBody_LUTC1(const IppLUTParallelBody_LUTC1&);
    IppLUTParallelBody_LUTC1& operator=(const IppLUTParallelBody_LUTC1&);
};
#endif
// Parallel loop body for a 3- or 4-channel LUT applied through the IPP
// ippiLUTPalette_8u_C3R / ippiLUTPalette_8u_C4R primitives.
//
// The constructor allocates a scratch buffer with ippMalloc and copies the
// LUT into per-channel tables (lutTable[]) via ippiCopy_8u_C3P3R / C4P4R.
// *ok is set to true only if that preparation succeeds; operator() is a
// no-op while *ok is false and clears *ok if the IPP LUT call fails.
// The destructor releases the scratch buffer.
class IppLUTParallelBody_LUTCN : public ParallelLoopBody
{
public:
    bool *ok;
    const Mat& src_;
    const Mat& lut_;
    Mat& dst_;

    int lutcn;          // number of LUT channels (expected to be 3 or 4)

    uchar* lutBuffer;   // owns the storage the lutTable[] entries point into
    uchar* lutTable[4]; // one 256-entry table per channel

    IppLUTParallelBody_LUTCN(const Mat& src, const Mat& lut, Mat& dst, bool* _ok)
        : ok(_ok), src_(src), lut_(lut), dst_(dst), lutBuffer(NULL)
    {
        // Start from a well-defined "not usable" state so that every early
        // return below reports failure regardless of what the caller put
        // into *ok, and so the table pointers are never indeterminate.
        *ok = false;
        lutTable[0] = lutTable[1] = lutTable[2] = lutTable[3] = NULL;

        lutcn = lut.channels();
        IppiSize sz256 = {256, 1};

        size_t elemSize1 = dst.elemSize1();
        CV_DbgAssert(elemSize1 == 1);
        lutBuffer = (uchar*)ippMalloc(256 * (int)elemSize1 * 4);
        if (lutBuffer == NULL)
        {
            // Allocation failed: do not build table pointers from NULL and
            // do not hand them to IPP; leave *ok == false.
            setIppErrorStatus();
            return;
        }

        lutTable[0] = lutBuffer + 0;
        lutTable[1] = lutBuffer + 1 * 256 * elemSize1;
        lutTable[2] = lutBuffer + 2 * 256 * elemSize1;
        lutTable[3] = lutBuffer + 3 * 256 * elemSize1;

        CV_DbgAssert(lutcn == 3 || lutcn == 4);
        if (lutcn == 3)
        {
            // NOTE(review): the destination step passed here is the LUT's row
            // step; with a one-row ROI it is presumably never used to advance
            // - confirm against the IPP documentation.
            IppStatus status = ippiCopy_8u_C3P3R(lut.data, (int)lut.step[0], lutTable, (int)lut.step[0], sz256);
            if (status < 0)
            {
                setIppErrorStatus();
                return;
            }
        }
        else if (lutcn == 4)
        {
            IppStatus status = ippiCopy_8u_C4P4R(lut.data, (int)lut.step[0], lutTable, (int)lut.step[0], sz256);
            if (status < 0)
            {
                setIppErrorStatus();
                return;
            }
        }

        *ok = true;
    }

    ~IppLUTParallelBody_LUTCN()
    {
        if (lutBuffer != NULL)
            ippFree(lutBuffer);
        lutBuffer = NULL;
        lutTable[0] = NULL;
    }

    void operator()( const cv::Range& range ) const
    {
        // Skip the work once preparation or an earlier call has failed.
        if (!*ok)
            return;

        const int row0 = range.start;
        const int row1 = range.end;

        // Restrict source and destination to the rows of this stripe.
        Mat src = src_.rowRange(row0, row1);
        Mat dst = dst_.rowRange(row0, row1);

        if (lutcn == 3)
        {
            if (ippiLUTPalette_8u_C3R(
                    src.data, (int)src.step[0], dst.data, (int)dst.step[0],
                    ippiSize(dst.size()), lutTable, 8) >= 0)
                return;
        }
        else if (lutcn == 4)
        {
            if (ippiLUTPalette_8u_C4R(
                    src.data, (int)src.step[0], dst.data, (int)dst.step[0],
                    ippiSize(dst.size()), lutTable, 8) >= 0)
                return;
        }

        // Reached when the IPP call reported an error or lutcn is unsupported.
        setIppErrorStatus();
        *ok = false;
    }

private:
    // Copying is disabled (the object owns lutBuffer): declared, not defined.
    IppLUTParallelBody_LUTCN(const IppLUTParallelBody_LUTCN&);
    IppLUTParallelBody_LUTCN& operator=(const IppLUTParallelBody_LUTCN&);
};
} // namespace ipp
#endif // IPP
class LUTParallelBody : public ParallelLoopBody
{
public:
bool* ok;
const Mat& src_;
const Mat& lut_;
Mat& dst_;
LUTFunc func;
LUTParallelBody(const Mat& src, const Mat& lut, Mat& dst, bool* _ok)
: ok(_ok), src_(src), lut_(lut), dst_(dst)
{
func = lutTab[lut.depth()];
*ok = (func != NULL);
}
void operator()( const cv::Range& range ) const
{
CV_DbgAssert(*ok);
const int row0 = range.start;
const int row1 = range.end;
Mat src = src_.rowRange(row0, row1);
Mat dst = dst_.rowRange(row0, row1);
int cn = src.channels();
int lutcn = lut_.channels();
const Mat* arrays[] = {&src, &dst, 0};
uchar* ptrs[2];
NAryMatIterator it(arrays, ptrs);
int len = (int)it.size;
for( size_t i = 0; i < it.nplanes; i++, ++it )
func(ptrs[0], lut_.data, ptrs[1], len, cn, lutcn);
}
private:
LUTParallelBody(const LUTParallelBody&);
LUTParallelBody& operator=(const LUTParallelBody&);
};
}
void cv::LUT( InputArray _src, InputArray _lut, OutputArray _dst )
@ -1582,6 +1777,44 @@ void cv::LUT( InputArray _src, InputArray _lut, OutputArray _dst )
_dst.create(src.dims, src.size, CV_MAKETYPE(_lut.depth(), cn));
Mat dst = _dst.getMat();
if (_src.dims() <= 2)
{
bool ok = false;
Ptr<ParallelLoopBody> body;
#if defined(HAVE_IPP) && !defined(HAVE_IPP_ICV_ONLY)
size_t elemSize1 = CV_ELEM_SIZE1(dst.depth());
#if 0 // there are no performance benefits (PR #2653)
if (lutcn == 1)
{
ParallelLoopBody* p = new ipp::IppLUTParallelBody_LUTC1(src, lut, dst, &ok);
body.reset(p);
}
else
#endif
if ((lutcn == 3 || lutcn == 4) && elemSize1 == 1)
{
ParallelLoopBody* p = new ipp::IppLUTParallelBody_LUTCN(src, lut, dst, &ok);
body.reset(p);
}
#endif
if (body == NULL || ok == false)
{
ok = false;
ParallelLoopBody* p = new LUTParallelBody(src, lut, dst, &ok);
body.reset(p);
}
if (body != NULL && ok)
{
Range all(0, dst.rows);
if (dst.total()>>18)
parallel_for_(all, *body, (double)std::max((size_t)1, dst.total()>>16));
else
(*body)(all);
if (ok)
return;
}
}
LUTFunc func = lutTab[lut.depth()];
CV_Assert( func != 0 );

@ -1258,8 +1258,11 @@ static bool IPPMorphReplicate(int op, const Mat &src, Mat &dst, const Mat &kerne
default:
return false;
}
#undef IPP_MORPH_CASE
#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ == 8
return false; /// It disables false positive warning in GCC 4.8.2
#endif
#endif
}
}

Loading…
Cancel
Save