Merge pull request #13315 from elatkin:el/gapi_perf_erdilate

GAPI (fluid): Erode/Dilate optimization (#13315)

* GAPI (fluid): Erode/Dilate optimization: hard-code 3x3 case

* GAPI (fluid): Erode/Dilate optimization: CPU dispatcher

* GAPI (fluid): Erode/Dilate optimization: 10-15x speed-up with CV_SIMD
pull/13319/head^2
Evgeny Latkin 6 years ago committed by Alexander Alekhin
parent 6808d33b2f
commit 992d5b8bcd
  1. 65
      modules/gapi/src/backends/fluid/gfluidimgproc.cpp
  2. 22
      modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
  3. 19
      modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
  4. 209
      modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp

@ -1164,8 +1164,6 @@ GAPI_FLUID_KERNEL(GFluidFilter2D, cv::gapi::imgproc::GFilter2D, true)
//
//-----------------------------
enum Morphology { M_ERODE, M_DILATE };
template<typename DST, typename SRC>
static void run_morphology( Buffer& dst,
const View & src,
@ -1175,6 +1173,10 @@ static void run_morphology( Buffer& dst,
const cv::Point & /* anchor */,
Morphology morphology)
{
static_assert(std::is_same<DST, SRC>::value, "unsupported combination of types");
GAPI_Assert(M_ERODE == morphology || M_DILATE == morphology);
static const int maxLines = 9;
GAPI_Assert(k_rows <= maxLines);
@ -1194,43 +1196,44 @@ static void run_morphology( Buffer& dst,
int width = dst.length();
int chan = dst.meta().chan;
for (int w=0; w < width; w++)
// call optimized code, if 3x3
if (3 == k_rows && 3 == k_cols)
{
// TODO: make this cycle innermost
for (int c=0; c < chan; c++)
run_morphology3x3_impl(out, in, width, chan, k, morphology);
return;
}
// reference: any size of k[]
int length = width * chan;
for (int l=0; l < length; l++)
{
SRC result;
if (M_ERODE == morphology)
{
SRC result=0;
if (M_ERODE == morphology)
{
result = std::numeric_limits<SRC>::max();
}
else if (M_DILATE == morphology)
{
result = std::numeric_limits<SRC>::min();
}
else
CV_Error(cv::Error::StsBadArg, "unsupported morphology operation");
result = std::numeric_limits<SRC>::max();
}
else // if (M_DILATE == morphology)
{
result = std::numeric_limits<SRC>::min();
}
for (int i=0; i < k_rows; i++)
for (int j=0; j < k_cols; j++)
for (int i=0; i < k_rows; i++)
for (int j=0; j < k_cols; j++)
{
if ( k[k_cols*i + j] )
{
if ( k[k_cols*i + j] )
if (M_ERODE == morphology)
{
result = (std::min)(result, in[i][l + (j - border_x)*chan]);
}
else // if (M_DILATE == morphology)
{
if (M_ERODE == morphology)
{
result = std::min(result, in[i][(w + j - border_x)*chan + c]);
}
else if (M_DILATE == morphology)
{
result = std::max(result, in[i][(w + j - border_x)*chan + c]);
}
else
CV_Error(cv::Error::StsBadArg, "unsupported morphology operation");
result = (std::max)(result, in[i][l + (j - border_x)*chan]);
}
}
out[w*chan + c] = saturate<DST>(result, rintf);
}
out[l] = saturate<DST>(result, rintf);
}
}

@ -111,6 +111,28 @@ RUN_FILTER2D_3X3_IMPL( float, float)
#undef RUN_FILTER2D_3X3_IMPL
//-----------------------------
//
// Fluid kernels: Erode, Dilate
//
//-----------------------------
// Stamps out the public run_morphology3x3_impl() entry point for element type T.
// The generated function does no work of its own: it forwards all six arguments
// unchanged through CV_CPU_DISPATCH with CV_CPU_DISPATCH_MODES_ALL.
// NOTE(review): CV_CPU_DISPATCH is defined outside this view; presumably it selects
// the CPU-specific build of run_morphology3x3_impl at run time -- confirm.
#define RUN_MORPHOLOGY3X3_IMPL(T) \
void run_morphology3x3_impl(T out[], const T *in[], int width, int chan, \
const uchar k[], Morphology morphology) \
{ \
CV_CPU_DISPATCH(run_morphology3x3_impl, \
(out, in, width, chan, k, morphology), \
CV_CPU_DISPATCH_MODES_ALL); \
}
// One dispatcher per supported element type.
RUN_MORPHOLOGY3X3_IMPL(uchar )
RUN_MORPHOLOGY3X3_IMPL(ushort)
RUN_MORPHOLOGY3X3_IMPL( short)
RUN_MORPHOLOGY3X3_IMPL( float)
#undef RUN_MORPHOLOGY3X3_IMPL
} // namespace fluid
} // namespace gapi
} // namespace cv

@ -77,6 +77,25 @@ RUN_FILTER2D_3X3_IMPL( float, float)
#undef RUN_FILTER2D_3X3_IMPL
//-----------------------------
//
// Fluid kernels: Erode, Dilate
//
//-----------------------------
// Selects the morphological operation performed by run_morphology3x3_impl():
// M_ERODE takes the minimum and M_DILATE the maximum over the selected neighbours.
enum Morphology { M_ERODE, M_DILATE };
// Declares run_morphology3x3_impl() for element type T:
//   out        - output row
//   in         - array of input row pointers
//   width/chan - row width and channel count
//   k          - 3x3 structuring element
//   morphology - M_ERODE or M_DILATE
#define RUN_MORPHOLOGY3X3_IMPL(T) \
void run_morphology3x3_impl(T out[], const T *in[], int width, int chan, \
const uchar k[], Morphology morphology);
// One prototype per supported element type.
RUN_MORPHOLOGY3X3_IMPL(uchar )
RUN_MORPHOLOGY3X3_IMPL(ushort)
RUN_MORPHOLOGY3X3_IMPL( short)
RUN_MORPHOLOGY3X3_IMPL( float)
#undef RUN_MORPHOLOGY3X3_IMPL
} // namespace fluid
} // namespace gapi
} // namespace cv

@ -19,6 +19,8 @@
#include <cstdint>
#include <cstring>
#include <algorithm>
#include <limits>
#include <vector>
#ifdef __GNUC__
@ -97,6 +99,23 @@ RUN_FILTER2D_3X3_IMPL( float, float)
#undef RUN_FILTER2D_3X3_IMPL
//-----------------------------
//
// Fluid kernels: Erode, Dilate
//
//-----------------------------
// Forward declaration of run_morphology3x3_impl() for element type T. It is placed
// before the CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY guard, so the prototype stays
// visible even when the definitions below are compiled out.
#define RUN_MORPHOLOGY3X3_IMPL(T) \
void run_morphology3x3_impl(T out[], const T *in[], int width, int chan, \
const uchar k[], Morphology morphology);
// One prototype per supported element type.
RUN_MORPHOLOGY3X3_IMPL(uchar )
RUN_MORPHOLOGY3X3_IMPL(ushort)
RUN_MORPHOLOGY3X3_IMPL( short)
RUN_MORPHOLOGY3X3_IMPL( float)
#undef RUN_MORPHOLOGY3X3_IMPL
//----------------------------------------------------------------------
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
@ -1099,6 +1118,196 @@ RUN_FILTER2D_3X3_IMPL( float, float)
#undef RUN_FILTER2D_3X3_IMPL
//-----------------------------
//
// Fluid kernels: Erode, Dilate
//
//-----------------------------
// Scalar (non-vectorized) 3x3 erode/dilate of a single output row.
//
// Template parameters:
//   morphology - M_ERODE (minimum over selected neighbours) or
//                M_DILATE (maximum over selected neighbours)
//   T          - element type
//
// Parameters:
//   out   - output row; elements [0, width*chan) are written
//   in    - three input rows (top, middle, bottom); for every output element l the
//           code reads in[r][l - chan], in[r][l] and in[r][l + chan], so each row
//           must be readable one pixel (chan elements) before index 0 and one pixel
//           past index width*chan - 1
//   width - number of pixels in the row
//   chan  - number of interleaved channels per pixel
//   k     - 3x3 structuring element, 9 values in row-major order; a non-zero value
//           includes the corresponding neighbour
//
// If every k[] entry is zero, each output element is the neutral value of the
// reduction (max() for erode, lowest() for dilate).
template<Morphology morphology, typename T>
static void run_morphology3x3_reference(T out[], const T *in[], int width, int chan,
                                        const uchar k[])
{
    constexpr int k_size = 3;
    constexpr int border = (k_size - 1) / 2;

    const uchar kernel[3][3] = {{k[0], k[1], k[2]}, {k[3], k[4], k[5]}, {k[6], k[7], k[8]}};

    const int length = width * chan;
    const int shift  = border * chan;  // distance between horizontal neighbours

    // Neutral element of the reduction. For dilation this must be lowest(), not min():
    // for floating-point T, min() is the smallest *positive* normal value, which would
    // win against any all-negative neighbourhood. For integer T both are the same.
    const T neutral = M_ERODE == morphology? std::numeric_limits<T>::max():
                                             std::numeric_limits<T>::lowest();

    for (int l=0; l < length; l++)
    {
        T result = neutral;

        // The nine taps are written out explicitly (no kernel loops) on purpose:
        // this is the hard-coded 3x3 case.
        if (M_ERODE == morphology)
        {
            result = kernel[0][0]? (std::min)(result, in[0][l - shift]): result;
            result = kernel[0][1]? (std::min)(result, in[0][l        ]): result;
            result = kernel[0][2]? (std::min)(result, in[0][l + shift]): result;

            result = kernel[1][0]? (std::min)(result, in[1][l - shift]): result;
            result = kernel[1][1]? (std::min)(result, in[1][l        ]): result;
            result = kernel[1][2]? (std::min)(result, in[1][l + shift]): result;

            result = kernel[2][0]? (std::min)(result, in[2][l - shift]): result;
            result = kernel[2][1]? (std::min)(result, in[2][l        ]): result;
            result = kernel[2][2]? (std::min)(result, in[2][l + shift]): result;
        }
        else // if (M_DILATE == morphology)
        {
            result = kernel[0][0]? (std::max)(result, in[0][l - shift]): result;
            result = kernel[0][1]? (std::max)(result, in[0][l        ]): result;
            result = kernel[0][2]? (std::max)(result, in[0][l + shift]): result;

            result = kernel[1][0]? (std::max)(result, in[1][l - shift]): result;
            result = kernel[1][1]? (std::max)(result, in[1][l        ]): result;
            result = kernel[1][2]? (std::max)(result, in[1][l + shift]): result;

            result = kernel[2][0]? (std::max)(result, in[2][l - shift]): result;
            result = kernel[2][1]? (std::max)(result, in[2][l        ]): result;
            result = kernel[2][2]? (std::max)(result, in[2][l + shift]): result;
        }

        out[l] = result;
    }
}
#if CV_SIMD
// Vectorized 3x3 erode/dilate of a single output row using OpenCV universal
// intrinsics (vx_load / v_min / v_max / v_store).
//
// Template parameters:
//   morphology - M_ERODE (minimum) or M_DILATE (maximum)
//   T          - element type
//   VT         - vector type holding VT::nlanes elements of T
//   S          - type of the broadcast callable
//
// Parameters:
//   out    - output row; elements [0, width*chan) are written
//   in     - three input rows; in[r][l - chan] .. in[r][l + chan] are read for every
//            output element l, so each row must be readable one pixel (chan elements)
//            beyond both ends of the output range
//   width  - number of pixels in the row
//   chan   - number of interleaved channels per pixel
//   k      - 3x3 structuring element, 9 values in row-major order; non-zero includes
//            the corresponding neighbour
//   setall - callable that broadcasts a scalar T into a VT
//
// Precondition: width*chan >= VT::nlanes (checked by GAPI_DbgAssert in the tail path).
template<Morphology morphology, typename T, typename VT, typename S>
static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan,
                                   const uchar k[], S setall)
{
    constexpr int k_size = 3;
    constexpr int border = (k_size - 1) / 2;
    constexpr int nlanes = VT::nlanes;

    const uchar kernel[3][3] = {{k[0], k[1], k[2]}, {k[3], k[4], k[5]}, {k[6], k[7], k[8]}};

    const int length = width * chan;
    const int shift  = border * chan;  // distance between horizontal neighbours

    // Neutral element of the reduction, broadcast once. For dilation this must be
    // lowest(), not min(): for float, min() is the smallest *positive* normal value and
    // would win against an all-negative neighbourhood. For integer T both are the same.
    const VT neutral = M_ERODE == morphology? setall(std::numeric_limits<T>::max()):
                                              setall(std::numeric_limits<T>::lowest());

    for (int l=0; l < length;)
    {
        // main part of output row: whole vectors
        for (; l <= length - nlanes; l += nlanes)
        {
            VT r = neutral;

            if (M_ERODE == morphology)
            {
                if (kernel[0][0]) r = v_min(r, vx_load(&in[0][l - shift]));
                if (kernel[0][1]) r = v_min(r, vx_load(&in[0][l        ]));
                if (kernel[0][2]) r = v_min(r, vx_load(&in[0][l + shift]));

                if (kernel[1][0]) r = v_min(r, vx_load(&in[1][l - shift]));
                if (kernel[1][1]) r = v_min(r, vx_load(&in[1][l        ]));
                if (kernel[1][2]) r = v_min(r, vx_load(&in[1][l + shift]));

                if (kernel[2][0]) r = v_min(r, vx_load(&in[2][l - shift]));
                if (kernel[2][1]) r = v_min(r, vx_load(&in[2][l        ]));
                if (kernel[2][2]) r = v_min(r, vx_load(&in[2][l + shift]));
            }
            else // if (M_DILATE == morphology)
            {
                if (kernel[0][0]) r = v_max(r, vx_load(&in[0][l - shift]));
                if (kernel[0][1]) r = v_max(r, vx_load(&in[0][l        ]));
                if (kernel[0][2]) r = v_max(r, vx_load(&in[0][l + shift]));

                if (kernel[1][0]) r = v_max(r, vx_load(&in[1][l - shift]));
                if (kernel[1][1]) r = v_max(r, vx_load(&in[1][l        ]));
                if (kernel[1][2]) r = v_max(r, vx_load(&in[1][l + shift]));

                if (kernel[2][0]) r = v_max(r, vx_load(&in[2][l - shift]));
                if (kernel[2][1]) r = v_max(r, vx_load(&in[2][l        ]));
                if (kernel[2][2]) r = v_max(r, vx_load(&in[2][l + shift]));
            }

            v_store(&out[l], r);
        }

        // tail (if any): step back so that the last vector ends exactly at 'length';
        // the outer loop then runs the main loop once more for that final vector,
        // recomputing a few already-written elements.
        // NOTE(review): assumes 'out' does not alias the input rows -- confirm.
        if (l < length)
        {
            GAPI_DbgAssert(length >= nlanes);
            l = length - nlanes;
        }
    }
}
#endif
// Chooses the implementation of the 3x3 erode/dilate for one output row.
//
// With CV_SIMD enabled, the vectorized routine is used when T is one of
// uchar / ushort / short / float and the row holds at least one full vector of
// elements (width*chan >= nlanes of the matching vector type). In every other case
// the scalar run_morphology3x3_reference() handles the row.
template<Morphology morphology, typename T>
static void run_morphology3x3_code(T out[], const T *in[], int width, int chan,
                                   const uchar k[])
{
#if CV_SIMD
    const int row_len = width * chan;

    // silences "unused variable" in case no type test below refers to it
    (void) row_len;

    // The type tests are mutually exclusive, so at most one branch is ever taken.
    // The reinterpret_casts do not change the pointers' real type in the taken branch:
    // they only let the untaken branches compile for the other T.

    if (std::is_same<T, uchar>::value && row_len >= v_uint8::nlanes)
    {
        uchar        *dst = reinterpret_cast<uchar*>(out);
        const uchar **src = reinterpret_cast<const uchar**>(in);
        run_morphology3x3_simd<morphology, uchar, v_uint8>(dst, src, width, chan, k,
                                                           vx_setall_u8);
        return;
    }

    if (std::is_same<T, ushort>::value && row_len >= v_uint16::nlanes)
    {
        ushort        *dst = reinterpret_cast<ushort*>(out);
        const ushort **src = reinterpret_cast<const ushort**>(in);
        run_morphology3x3_simd<morphology, ushort, v_uint16>(dst, src, width, chan, k,
                                                             vx_setall_u16);
        return;
    }

    if (std::is_same<T, short>::value && row_len >= v_int16::nlanes)
    {
        short        *dst = reinterpret_cast<short*>(out);
        const short **src = reinterpret_cast<const short**>(in);
        run_morphology3x3_simd<morphology, short, v_int16>(dst, src, width, chan, k,
                                                           vx_setall_s16);
        return;
    }

    if (std::is_same<T, float>::value && row_len >= v_float32::nlanes)
    {
        float        *dst = reinterpret_cast<float*>(out);
        const float **src = reinterpret_cast<const float**>(in);
        run_morphology3x3_simd<morphology, float, v_float32>(dst, src, width, chan, k,
                                                             vx_setall_f32);
        return;
    }
#endif  // CV_SIMD

    // scalar fallback: SIMD unavailable, or the row is shorter than one vector
    run_morphology3x3_reference<morphology>(out, in, width, chan, k);
}
// Defines run_morphology3x3_impl() for element type T. The run-time 'morphology'
// argument is turned into a compile-time template argument of
// run_morphology3x3_code(); any value other than M_ERODE / M_DILATE is reported
// through CV_Error with StsBadArg.
#define RUN_MORPHOLOGY3X3_IMPL(T)                                            \
void run_morphology3x3_impl(T out[], const T *in[], int width, int chan,     \
                            const uchar k[], Morphology morphology)          \
{                                                                            \
    switch (morphology)                                                      \
    {                                                                        \
    case M_ERODE:                                                            \
        run_morphology3x3_code<M_ERODE>(out, in, width, chan, k);            \
        break;                                                               \
    case M_DILATE:                                                           \
        run_morphology3x3_code<M_DILATE>(out, in, width, chan, k);           \
        break;                                                               \
    default:                                                                 \
        CV_Error(cv::Error::StsBadArg, "unsupported morphology operation");  \
    }                                                                        \
}

// One definition per supported element type.
RUN_MORPHOLOGY3X3_IMPL(uchar )
RUN_MORPHOLOGY3X3_IMPL(ushort)
RUN_MORPHOLOGY3X3_IMPL( short)
RUN_MORPHOLOGY3X3_IMPL( float)

#undef RUN_MORPHOLOGY3X3_IMPL
//------------------------------------------------------------------------------
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

Loading…
Cancel
Save