From beb14c70da8a573d4157348a6800bd6e67d56d5c Mon Sep 17 00:00:00 2001
From: Anna Khakimova
Date: Fri, 8 Nov 2019 11:15:40 +0300
Subject: [PATCH] GAPI Fluid: SIMD optimization for sep filters 5x5 kernel size
 (gaussBlur)

---
 .../gapi/src/backends/fluid/gfluidimgproc.cpp |  14 +-
 .../fluid/gfluidimgproc_func.dispatch.cpp     |  22 +
 .../src/backends/fluid/gfluidimgproc_func.hpp |  19 +
 .../fluid/gfluidimgproc_func.simd.hpp         | 599 +++++++++++++++++-
 4 files changed, 640 insertions(+), 14 deletions(-)

diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc.cpp b/modules/gapi/src/backends/fluid/gfluidimgproc.cpp
index ba70954476..d1445675fa 100644
--- a/modules/gapi/src/backends/fluid/gfluidimgproc.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc.cpp
@@ -599,6 +599,7 @@ static void run_sepfilter(Buffer& dst, const View& src,
 {
     constexpr int kMax = 11;
     GAPI_Assert(kxLen <= kMax && kyLen <= kMax);
+    GAPI_Assert(kxLen == kyLen);
 
     const SRC *in[kMax];
     DST *out;
@@ -625,6 +626,13 @@ static void run_sepfilter(Buffer& dst, const View& src,
         int border = xborder;
         run_sepfilter3x3_impl(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0);
     }
+    else if (kxLen == 5 && kyLen == 5)
+    {
+        int y  = dst.y();
+        int y0 = dst.priv().writeStart();
+
+        run_sepfilter5x5_impl(out, in, width, chan, kx, ky, xborder, scale, delta, buf, y, y0);
+    }
     else
     {
         int length = chan * width;
@@ -788,7 +796,9 @@ GAPI_FLUID_KERNEL(GFluidGaussBlur, cv::gapi::imgproc::GGaussBlur, true)
                     Buffer& dst,
                     Buffer& scratch)
     {
-        int kxsize = ksize.width;
+        GAPI_Assert(ksize.height == ksize.width);
+        GAPI_Assert((ksize.height == 3) || (ksize.height == 5));
+        const int kxsize = ksize.width;
         int kysize = ksize.height;
 
         auto *kx = scratch.OutLine<float>(); // cached kernX data
@@ -801,7 +811,7 @@ GAPI_FLUID_KERNEL(GFluidGaussBlur, cv::gapi::imgproc::GGaussBlur, true)
         constexpr int buffSize = 5;
         GAPI_Assert(ksize.height <= buffSize);
 
-        float *buf[buffSize]{};
+        float *buf[buffSize] = { nullptr };
 
         buf[0] = ky + kysize;
         for (int i = 1; i < ksize.height; ++i)
diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
index 3ea4676dde..7b6dfb11f2 100644
--- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
@@ -119,6 +119,28 @@ RUN_SEPFILTER3X3_IMPL( float,  float)
 
 #undef RUN_SEPFILTER3X3_IMPL
 
+#define RUN_SEPFILTER5x5_IMPL(DST, SRC)                                               \
+void run_sepfilter5x5_impl(DST out[], const SRC *in[], int width, int chan,           \
+                           const float kx[], const float ky[], int border,            \
+                           float scale, float delta,                                  \
+                           float *buf[], int y, int y0)                               \
+{                                                                                     \
+    CV_CPU_DISPATCH(run_sepfilter5x5_impl,                                            \
+                    (out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0), \
+                    CV_CPU_DISPATCH_MODES_ALL);                                       \
+}
+
+RUN_SEPFILTER5x5_IMPL(uchar, uchar)
+RUN_SEPFILTER5x5_IMPL(short, uchar)
+RUN_SEPFILTER5x5_IMPL(float, uchar)
+RUN_SEPFILTER5x5_IMPL(ushort, ushort)
+RUN_SEPFILTER5x5_IMPL(short, ushort)
+RUN_SEPFILTER5x5_IMPL(float, ushort)
+RUN_SEPFILTER5x5_IMPL(short, short)
+RUN_SEPFILTER5x5_IMPL(float, short)
+RUN_SEPFILTER5x5_IMPL(float, float)
+
+#undef RUN_SEPFILTER5x5_IMPL
 //-------------------------
 //
 // Fluid kernels: Filter 2D
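Note: the GaussBlur kernel above packs the cached kernels and the five
horizontal-pass rows into a single scratch allocation. A minimal standalone
sketch of that layout (the helper name and the `length` row stride are
illustrative assumptions; the patch itself only shows buf[0] = ky + kysize):

    // assumed scratch layout: | kx: kxsize floats | ky: kysize floats | rows 0..4: length floats each |
    static void carve_scratch(float *scratch, int kxsize, int kysize, int length,
                              float *buf[5])
    {
        float *kx = scratch;      // cached kernX data, as in GFluidGaussBlur::run()
        float *ky = kx + kxsize;  // cached kernY data follows kernX
        buf[0] = ky + kysize;     // first horizontal-pass row follows the kernels
        for (int i = 1; i < 5; ++i)
        {
            buf[i] = buf[i - 1] + length;  // assumed stride: one row of width*chan floats
        }
    }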
diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp b/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
index b89ccd8988..79715d1754 100644
--- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
@@ -78,6 +78,25 @@ RUN_SEPFILTER3X3_IMPL( float,  float)
 
 #undef RUN_SEPFILTER3X3_IMPL
 
+#define RUN_SEPFILTER5x5_IMPL(DST, SRC)                                     \
+void run_sepfilter5x5_impl(DST out[], const SRC *in[], int width, int chan, \
+                           const float kx[], const float ky[], int border,  \
+                           float scale, float delta,                        \
+                           float *buf[], int y, int y0);
+
+
+RUN_SEPFILTER5x5_IMPL(uchar, uchar)
+RUN_SEPFILTER5x5_IMPL(short, uchar)
+RUN_SEPFILTER5x5_IMPL(float, uchar)
+RUN_SEPFILTER5x5_IMPL(ushort, ushort)
+RUN_SEPFILTER5x5_IMPL(short, ushort)
+RUN_SEPFILTER5x5_IMPL(float, ushort)
+RUN_SEPFILTER5x5_IMPL(short, short)
+RUN_SEPFILTER5x5_IMPL(float, short)
+RUN_SEPFILTER5x5_IMPL(float, float)
+
+#undef RUN_SEPFILTER5x5_IMPL
+
 //-------------------------
 //
 // Fluid kernels: Filter 2D
diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
index ec1c8da971..e0d10e4ecd 100644
--- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
@@ -100,6 +100,23 @@ RUN_SEPFILTER3X3_IMPL( float,  float)
 
 #undef RUN_SEPFILTER3X3_IMPL
 
+#define RUN_SEPFILTER5x5_IMPL(DST, SRC)                                     \
+void run_sepfilter5x5_impl(DST out[], const SRC *in[], int width, int chan, \
+                           const float kx[], const float ky[], int border,  \
+                           float scale, float delta,                        \
+                           float *buf[], int y, int y0);
+
+RUN_SEPFILTER5x5_IMPL(uchar, uchar)
+RUN_SEPFILTER5x5_IMPL(short, uchar)
+RUN_SEPFILTER5x5_IMPL(float, uchar)
+RUN_SEPFILTER5x5_IMPL(ushort, ushort)
+RUN_SEPFILTER5x5_IMPL(short, ushort)
+RUN_SEPFILTER5x5_IMPL(float, ushort)
+RUN_SEPFILTER5x5_IMPL(short, short)
+RUN_SEPFILTER5x5_IMPL(float, short)
+RUN_SEPFILTER5x5_IMPL(float, float)
+
+#undef RUN_SEPFILTER5x5_IMPL
 //-------------------------
 //
 // Fluid kernels: Filter 2D
@@ -978,11 +995,11 @@ void run_rgb2yuv422_impl(uchar out[], const uchar in[], int width)
     }
 }
 
-//-------------------------
+//-----------------------------
 //
-// Fluid kernels: sepFilter
+// Fluid kernels: sepFilter 3x3
 //
-//-------------------------
+//-----------------------------
 
 #if CV_SIMD
 // this variant not using buf[] appears 15% faster than reference any-2-float code below
@@ -1322,7 +1339,7 @@ static void run_sepfilter3x3_char2short(short out[], const uchar *in[], int width
         }
     }
 }
-#endif
+#endif // USE_SEPFILTER3X3_CHAR2SHORT
 
 #endif  // CV_SIMD
 
@@ -1464,18 +1481,576 @@ void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan, \
     }                                                                       \
 }
 
-RUN_SEPFILTER3X3_IMPL(uchar , uchar )
-RUN_SEPFILTER3X3_IMPL( short, uchar )
-RUN_SEPFILTER3X3_IMPL( float, uchar )
+RUN_SEPFILTER3X3_IMPL(uchar, uchar)
+RUN_SEPFILTER3X3_IMPL(short, uchar)
+RUN_SEPFILTER3X3_IMPL(float, uchar)
 RUN_SEPFILTER3X3_IMPL(ushort, ushort)
-RUN_SEPFILTER3X3_IMPL( short, ushort)
-RUN_SEPFILTER3X3_IMPL( float, ushort)
-RUN_SEPFILTER3X3_IMPL( short,  short)
-RUN_SEPFILTER3X3_IMPL( float,  short)
-RUN_SEPFILTER3X3_IMPL( float,  float)
+RUN_SEPFILTER3X3_IMPL(short, ushort)
+RUN_SEPFILTER3X3_IMPL(float, ushort)
+RUN_SEPFILTER3X3_IMPL(short, short)
+RUN_SEPFILTER3X3_IMPL(float, short)
+RUN_SEPFILTER3X3_IMPL(float, float)
 
 #undef RUN_SEPFILTER3X3_IMPL
 
+//-----------------------------
+//
+// Fluid kernels: sepFilter 5x5
+//
+//-----------------------------
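+
+// All 5x5 variants below share one scheme: a horizontal 1D pass convolves
+// input rows with kx[] into the buf[] ring of five float rows, then a
+// vertical 1D pass folds those rows with ky[] into the output row.
+// The ring index r[n] = (y - y0 + n) % 5 maps kernel row n to a scratch row;
+// for example, y0 = 0 and y = 3 give r = {3, 4, 0, 1, 2}, so the four rows
+// filtered by previous calls are reused and only the newly entered row
+// (k0 = 4) is convolved on this call.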
+
+#if CV_SIMD
+
+// this variant with manually vectored rounding to uchar
+template<bool noscale, typename SRC>
+static void run_sepfilter5x5_any2char(uchar out[], const SRC *in[], int width, int chan,
+                                      const float kx[], const float ky[], int border,
+                                      float scale, float delta,
+                                      float *buf[], int y, int y0)
+{
+    constexpr int kxLen = 5;
+    constexpr int kyLen = kxLen;
+    constexpr int buffSize = 5;
+
+    int r[buffSize];
+    for (int n = 0; n < buffSize; ++n)
+    {
+        r[n] = (y - y0 + n) % 5;  // previous, this, next rows
+    }
+
+    const int length = width * chan;
+    const int shift = chan;
+
+    // horizontal pass
+
+    int k0 = (y == y0) ? 0 : 4;
+
+    for (int k = k0; k < kyLen; ++k)
+    {
+        const SRC *s[kxLen] = { nullptr };
+
+        for (int i = 0; i < kxLen; ++i)
+        {
+            // previous, this, next pixels
+            s[i] = in[k] + (i - border)*shift;
+        }
+
+        // rely on compiler vectorizing
+        for (int l = 0; l < length; ++l)
+        {
+            float sum = 0;
+            for (int j = 0; j < kxLen; ++j)
+            {
+                sum += s[j][l] * kx[j];
+            }
+            buf[r[k]][l] = sum;
+        }
+    }
+
+    // vertical pass
+
+    constexpr int nlanes = v_uint8::nlanes;
+
+    for (int l = 0; l < length;)
+    {
+        // main part of row
+        for (; l <= length - nlanes; l += nlanes)
+        {
+            v_float32 sum0 = vx_load(&buf[r[0]][l])                  * vx_setall_f32(ky[0]);
+            v_float32 sum1 = vx_load(&buf[r[0]][l +     nlanes / 4]) * vx_setall_f32(ky[0]);
+            v_float32 sum2 = vx_load(&buf[r[0]][l + 2 * nlanes / 4]) * vx_setall_f32(ky[0]);
+            v_float32 sum3 = vx_load(&buf[r[0]][l + 3 * nlanes / 4]) * vx_setall_f32(ky[0]);
+
+            for (int n = 1; n < kyLen; ++n)
+            {
+                sum0 = v_fma(vx_load(&buf[r[n]][l]),                  vx_setall_f32(ky[n]), sum0);
+                sum1 = v_fma(vx_load(&buf[r[n]][l +     nlanes / 4]), vx_setall_f32(ky[n]), sum1);
+                sum2 = v_fma(vx_load(&buf[r[n]][l + 2 * nlanes / 4]), vx_setall_f32(ky[n]), sum2);
+                sum3 = v_fma(vx_load(&buf[r[n]][l + 3 * nlanes / 4]), vx_setall_f32(ky[n]), sum3);
+            }
+
+            if (!noscale)
+            {
+                sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
+                sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
+                sum2 = v_fma(sum2, vx_setall_f32(scale), vx_setall_f32(delta));
+                sum3 = v_fma(sum3, vx_setall_f32(scale), vx_setall_f32(delta));
+            }
+
+            v_int32 isum0 = v_round(sum0),
+                    isum1 = v_round(sum1),
+                    isum2 = v_round(sum2),
+                    isum3 = v_round(sum3);
+
+            v_int16 ires0 = v_pack(isum0, isum1),
+                    ires1 = v_pack(isum2, isum3);
+
+            v_uint8 res = v_pack_u(ires0, ires1);
+            v_store(reinterpret_cast<uchar*>(&out[l]), res);
+        }
+
+        // tail (if any)
+        if (l < length)
+        {
+            GAPI_DbgAssert(length >= nlanes);
+            l = length - nlanes;
+        }
+    }
+    return;
+}
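+
+// Note on the narrowing above: one v_uint8 register holds nlanes uchar lanes,
+// i.e. four v_float32 registers' worth of pixels, hence the four accumulators.
+// The chain v_round (float32->int32), v_pack (int32->int16, saturating) and
+// v_pack_u (int16->uint8, saturating) does the rounding and clamping entirely
+// in registers. The tail is processed by stepping back to l = length - nlanes
+// and redoing one full, overlapping vector; this is safe because the
+// dispatching code below selects this variant only when length >= v_uint8::nlanes.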
+
+// this variant with manually vectored rounding to short/ushort
+template<bool noscale, typename DST, typename SRC>
+static void run_sepfilter5x5_any2short(DST out[], const SRC *in[], int width, int chan,
+                                       const float kx[], const float ky[], int border,
+                                       float scale, float delta,
+                                       float *buf[], int y, int y0)
+{
+    constexpr int kxLen = 5;
+    constexpr int kyLen = kxLen;
+    constexpr int buffSize = 5;
+
+    int r[buffSize];
+    for (int n = 0; n < buffSize; ++n)
+    {
+        r[n] = (y - y0 + n) % 5;  // previous, this, next rows
+    }
+
+    const int length = width * chan;
+    const int shift = chan;
+
+    // horizontal pass
+
+    int k0 = (y == y0) ? 0 : 4;
+
+    for (int k = k0; k < kyLen; ++k)
+    {
+        const SRC *s[kxLen] = { nullptr };
+
+        for (int i = 0; i < kxLen; ++i)
+        {
+            // previous, this, next pixels
+            s[i] = in[k] + (i - border)*shift;
+        }
+
+        // rely on compiler vectorizing
+        for (int l = 0; l < length; ++l)
+        {
+            float sum = 0;
+            for (int j = 0; j < kxLen; ++j)
+            {
+                sum += s[j][l] * kx[j];
+            }
+            buf[r[k]][l] = sum;
+        }
+    }
+
+    // vertical pass
+
+    constexpr int nlanes = v_int16::nlanes;
+
+    for (int l = 0; l < length;)
+    {
+        // main part of row
+        for (; l <= length - nlanes; l += nlanes)
+        {
+            v_float32 sum0 = vx_load(&buf[r[0]][l])              * vx_setall_f32(ky[0]);
+            v_float32 sum1 = vx_load(&buf[r[0]][l + nlanes / 2]) * vx_setall_f32(ky[0]);
+
+            for (int j = 1; j < kyLen; ++j)
+            {
+                sum0 = v_fma(vx_load(&buf[r[j]][l]),              vx_setall_f32(ky[j]), sum0);
+                sum1 = v_fma(vx_load(&buf[r[j]][l + nlanes / 2]), vx_setall_f32(ky[j]), sum1);
+            }
+
+            if (!noscale)
+            {
+                sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
+                sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
+            }
+
+            v_int32 isum0 = v_round(sum0),
+                    isum1 = v_round(sum1);
+
+            if (std::is_same<DST, short>::value)
+            {
+                // signed short
+                v_int16 res = v_pack(isum0, isum1);
+                v_store(reinterpret_cast<short*>(&out[l]), res);
+            }
+            else
+            {
+                // unsigned short
+                v_uint16 res = v_pack_u(isum0, isum1);
+                v_store(reinterpret_cast<ushort*>(&out[l]), res);
+            }
+        }
+
+        // tail (if any)
+        if (l < length)
+        {
+            GAPI_DbgAssert(length >= nlanes);
+            l = length - nlanes;
+        }
+    }
+    return;
+}
+
+// this variant not using buf[]
+template<bool noscale, typename SRC>
+static void run_sepfilter5x5_any2float(float out[], const SRC *in[], int width, int chan,
+                                       const float kx[], const float ky[], int border,
+                                       float scale, float delta)
+{
+    constexpr int kxLen = 5;
+    constexpr int kyLen = kxLen;
+    constexpr int buffSize = 5;
+
+    const int length = width * chan;
+    const int shift = chan;
+
+    static const int nlanes = v_float32::nlanes;
+
+    for (int l = 0; l < length;)
+    {
+        // main part
+        for (; l <= length - nlanes; l += nlanes)
+        {
+            auto xsum = [l, border, shift, kx](const SRC inp[])
+            {
+                v_float32 t[kxLen];
+                for (int i = 0; i < kxLen; ++i)
+                {
+                    t[i] = vx_load_f32(&inp[l + (i - border)*shift]);
+                }
+
+                v_float32 sum = t[0] * vx_setall_f32(kx[0]);
+                for (int j = 1; j < kxLen; ++j)
+                {
+                    sum = v_fma(t[j], vx_setall_f32(kx[j]), sum);
+                }
+
+                return sum;
+            };
+
+            v_float32 s[buffSize];
+            for (int m = 0; m < buffSize; ++m)
+            {
+                s[m] = xsum(in[m]);
+            }
+
+            v_float32 sum = s[0] * vx_setall_f32(ky[0]);
+            for (int n = 1; n < kyLen; ++n)
+            {
+                sum = v_fma(s[n], vx_setall_f32(ky[n]), sum);
+            }
+
+            if (!noscale)
+            {
+                sum = v_fma(sum, vx_setall_f32(scale), vx_setall_f32(delta));
+            }
+
+            v_store(&out[l], sum);
+        }
+
+        // tail (if any)
+        if (l < length)
+        {
+            GAPI_DbgAssert(length >= nlanes);
+            l = length - nlanes;
+        }
+    }
+    return;
+}
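+
+// The char2short variant below trades float math for Q15 fixed-point: kernel
+// taps are rounded to schar and the scale is kept as iscale = scale * 2^15.
+// In its vertical pass, v_mul_hi(sum << 1, iscale) computes
+// (2*sum*iscale) >> 16 == (sum*iscale) >> 15, i.e. approximately sum * scale;
+// e.g. scale = 0.25 gives iscale = 8192, and sum = 1000 maps to
+// (2*1000*8192) >> 16 = 250. The path is taken only when every tap and delta
+// convert to integers exactly and 0.01 <= |scale| <= 1.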
+
+#define USE_SEPFILTER5X5_CHAR2SHORT 1
+
+#if USE_SEPFILTER5X5_CHAR2SHORT
+template<bool noscale>
+static void run_sepfilter5x5_char2short(short out[], const uchar *in[], int width, int chan,
+                                        const float kx[], const float ky[], int border,
+                                        float scale, float delta,
+                                        float *buf[], int y, int y0)
+{
+    constexpr int kxLen = 5;
+    constexpr int kyLen = kxLen;
+    constexpr int buffSize = 5;
+
+    schar ikx[kxLen];
+    schar iky[kyLen];
+
+    for (int i = 0; i < kxLen; ++i)
+    {
+        ikx[i] = saturate<schar>(kx[i], rintf);
+        iky[i] = saturate<schar>(ky[i], rintf);
+    }
+
+    const short iscale = saturate<short>(scale * (1 << 15), rintf);
+    const short idelta = saturate<short>(delta, rintf);
+
+    // check if this code is applicable: the taps and delta must convert to
+    // integers exactly, and the scale must fit into the Q15 range
+    if (ikx[0] != kx[0] || ikx[1] != kx[1] || ikx[2] != kx[2] || ikx[3] != kx[3] || ikx[4] != kx[4] ||
+        iky[0] != ky[0] || iky[1] != ky[1] || iky[2] != ky[2] || iky[3] != ky[3] || iky[4] != ky[4] ||
+        idelta != delta ||
+        std::abs(scale) > 1 || std::abs(scale) < 0.01)
+    {
+        run_sepfilter5x5_any2short<noscale>(out, in, width, chan, kx, ky, border, scale, delta,
+                                            buf, y, y0);
+        return;
+    }
+
+    short *ibuf[buffSize];
+    int r[buffSize];
+
+    for (int n = 0; n < buffSize; ++n)
+    {
+        ibuf[n] = reinterpret_cast<short*>(buf[n]);
+        r[n] = (y - y0 + n) % 5;  // previous, this, next rows
+    }
+
+    const int length = width * chan;
+    const int shift = chan;
+
+    // horizontal pass
+    // full horizontal pass is needed only when the very 1st row in the ROI is
+    // handled; for the 2nd and further rows it is enough to convolve only the
+    // "next" row, as we can reuse buffers from previous calls to this kernel
+    // (Fluid processes rows consecutively: y = y0, y0+1, ...)
+    int k0 = (y == y0) ? 0 : 4;
+
+    constexpr int nlanes = v_int16::nlanes;
+
+    for (int k = k0; k < kyLen; ++k)
+    {
+        for (int l = 0; l < length;)
+        {
+            GAPI_Assert(length >= nlanes);
+
+            // main part of output row
+            for (; l <= length - nlanes; l += nlanes)
+            {
+                v_int16 sum = vx_setzero_s16();
+
+                for (int i = 0; i < kxLen; ++i)
+                {
+                    // previous, this, next pixels
+                    v_uint16 t = vx_load_expand(&in[k][l + (i - border)*shift]);
+
+                    sum += v_reinterpret_as_s16(t) * vx_setall_s16(ikx[i]);
+                }
+
+                v_store(&ibuf[r[k]][l], sum);
+            }
+
+            // tail (if any)
+            if (l < length)
+            {
+                GAPI_DbgAssert(length >= nlanes);
+                l = length - nlanes;
+            }
+        }
+    }
+
+    // vertical pass
+
+    for (int l = 0; l < length;)
+    {
+        // main part of output row
+        for (; l <= length - nlanes; l += nlanes)
+        {
+            v_int16 sum = vx_setzero_s16();
+
+            for (int i = 0; i < kyLen; ++i)
+            {
+                // previous, this, next rows
+                v_int16 s = vx_load(&ibuf[r[i]][l]);
+
+                sum += s * vx_setall_s16(iky[i]);
+            }
+
+            if (!noscale)
+            {
+                sum = v_mul_hi(sum << 1, vx_setall_s16(iscale)) + vx_setall_s16(idelta);
+            }
+
+            v_store(&out[l], sum);
+        }
+
+        // tail (if any)
+        if (l < length)
+        {
+            GAPI_DbgAssert(length >= nlanes);
+            l = length - nlanes;
+        }
+    }
+    return;
+}
+#endif // USE_SEPFILTER5X5_CHAR2SHORT
+
+#endif // CV_SIMD
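+
+// Portable reference implementation: the scalar fallback that
+// run_sepfilter5x5_code() below uses when CV_SIMD is off, when the DST/SRC
+// pair has no specialized variant above, or when the row is shorter than one
+// vector register.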
+
+template<bool noscale, typename DST, typename SRC>
+static void run_sepfilter5x5_reference(DST out[], const SRC *in[], int width, int chan,
+                                       const float kx[], const float ky[], int border,
+                                       float scale, float delta,
+                                       float *buf[], int y, int y0)
+{
+    constexpr int kxLen = 5;  // kernel size
+    constexpr int kyLen = kxLen;
+
+    int r[kyLen];
+    for (int n = 0; n < kyLen; ++n)
+    {
+        r[n] = (y - y0 + n) % 5;  // previous, this, next rows
+    }
+
+    int length = width * chan;
+    int shift = chan;
+
+    // horizontal pass
+
+    // full horizontal pass is needed only for the very 1st row in the ROI;
+    // for the 2nd and further rows it is enough to convolve only the "next"
+    // row, as we can reuse buffers from previous calls to this kernel
+    // (Fluid processes rows consecutively: y = y0, y0+1, ...)
+
+    int k0 = (y == y0) ? 0 : 4;
+
+    for (int k = k0; k < kyLen; ++k)
+    {
+        const SRC *s[kxLen] = { nullptr };
+
+        for (int i = 0; i < kxLen; ++i)
+        {
+            // previous, this, next pixels
+            s[i] = in[k] + (i - border)*shift;
+        }
+
+        // rely on compiler vectorizing
+        for (int l = 0; l < length; ++l)
+        {
+            float sum = 0;
+            for (int i = 0; i < kxLen; ++i)
+            {
+                sum += s[i][l] * kx[i];
+            }
+            buf[r[k]][l] = sum;
+        }
+    }
+
+    // vertical pass
+
+    for (int l = 0; l < length; ++l)
+    {
+        float sum = 0;
+        for (int j = 0; j < kyLen; ++j)
+        {
+            sum += buf[r[j]][l] * ky[j];
+        }
+
+        if (!noscale)
+        {
+            sum = sum * scale + delta;
+        }
+
+        out[l] = saturate<DST>(sum, rintf);
+    }
+    return;
+}
+
+template<bool noscale, typename DST, typename SRC>
+static void run_sepfilter5x5_code(DST out[], const SRC *in[], int width, int chan,
+                                  const float kx[], const float ky[], int border,
+                                  float scale, float delta,
+                                  float *buf[], int y, int y0)
+{
+#if CV_SIMD
+    int length = width * chan;
+
+    // length variable may be unused if types do not match at 'if' statements below
+    (void) length;
+
+    if (std::is_same<DST, short>::value && std::is_same<SRC, uchar>::value &&
+        length >= v_int16::nlanes)
+    {
+        run_sepfilter5x5_char2short<noscale>(reinterpret_cast<short*>(out),
+                                             reinterpret_cast<const uchar**>(in),
+                                             width, chan, kx, ky, border, scale, delta,
+                                             buf, y, y0);
+        return;
+    }
+
+    if (std::is_same<DST, float>::value && std::is_same<SRC, float>::value &&
+        length >= v_float32::nlanes)
+    {
+        run_sepfilter5x5_any2float<noscale>(reinterpret_cast<float*>(out), in, width,
+                                            chan, kx, ky, border, scale, delta);
+        return;
+    }
+
+    if (std::is_same<DST, short>::value && length >= v_int16::nlanes)
+    {
+        run_sepfilter5x5_any2short<noscale>(reinterpret_cast<short*>(out), in, width,
+                                            chan, kx, ky, border, scale, delta,
+                                            buf, y, y0);
+        return;
+    }
+
+    if (std::is_same<DST, ushort>::value && length >= v_uint16::nlanes)
+    {
+        run_sepfilter5x5_any2short<noscale>(reinterpret_cast<ushort*>(out), in, width,
+                                            chan, kx, ky, border, scale, delta,
+                                            buf, y, y0);
+        return;
+    }
+
+    if (std::is_same<DST, uchar>::value && length >= v_uint8::nlanes)
+    {
+        run_sepfilter5x5_any2char<noscale>(reinterpret_cast<uchar*>(out), in, width,
+                                           chan, kx, ky, border, scale, delta,
+                                           buf, y, y0);
+        return;
+    }
+#endif // CV_SIMD
+
+    // reference code is quite fast for the any-to-float case,
+    // but not for any-to-integral due to very slow rounding
+    run_sepfilter5x5_reference<noscale>(out, in, width, chan, kx, ky, border,
+                                        scale, delta, buf, y, y0);
+}
+
+#define RUN_SEPFILTER5x5_IMPL(DST, SRC)                                      \
+void run_sepfilter5x5_impl(DST out[], const SRC *in[], int width, int chan,  \
+                           const float kx[], const float ky[], int border,   \
+                           float scale, float delta,                         \
+                           float *buf[], int y, int y0)                      \
+{                                                                            \
+    if (scale == 1 && delta == 0)                                            \
+    {                                                                        \
+        constexpr bool noscale = true;                                       \
+        run_sepfilter5x5_code<noscale>(out, in, width, chan, kx, ky, border, \
+                                       scale, delta, buf, y, y0);            \
+    }                                                                        \
+    else                                                                     \
+    {                                                                        \
+        constexpr bool noscale = false;                                      \
+        run_sepfilter5x5_code<noscale>(out, in, width, chan, kx, ky, border, \
+                                       scale, delta, buf, y, y0);            \
+    }                                                                        \
+}
+
+RUN_SEPFILTER5x5_IMPL(uchar, uchar)
+RUN_SEPFILTER5x5_IMPL(short, uchar)
+RUN_SEPFILTER5x5_IMPL(float, uchar)
+RUN_SEPFILTER5x5_IMPL(ushort, ushort)
+RUN_SEPFILTER5x5_IMPL(short, ushort)
+RUN_SEPFILTER5x5_IMPL(float, ushort)
+RUN_SEPFILTER5x5_IMPL(short, short)
+RUN_SEPFILTER5x5_IMPL(float, short)
+RUN_SEPFILTER5x5_IMPL(float, float)
+
+#undef RUN_SEPFILTER5x5_IMPL
+
 //-------------------------
 //
 // Fluid kernels: Filter 2D