mirror of https://github.com/opencv/opencv.git
Merge pull request #20664 from anna-khakimova:ak/resize_simd
Fluid: SIMD for Resize Linear 8UC3

* SIMD for fluid::Resize 8U3C
* Rework horizontal pass + add 8U4C case
* Reproduce stackoverflow test
* StackOverflow test
* SSE42 impl
* SSE42 impl improvement
* GAPI: SSE42 simd opt for Resize 8UC3. Final version
* Fix tests
* Conditional compilation fix
* Applied comments
* Applied comments. Step2
* Applied comments. Step2
parent d934bb15b0
commit 3cfca01372

11 changed files with 1141 additions and 100 deletions

@@ -0,0 +1,733 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2021 Intel Corporation

#if !defined(GAPI_STANDALONE)

#include "opencv2/gapi/own/saturate.hpp"

#include "opencv2/core.hpp"
#include <opencv2/core/hal/intrin.hpp>

#include <smmintrin.h>

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <limits>
#include <vector>

#if defined __GNUC__
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wstrict-overflow"
#endif

namespace cv {
namespace gapi {
namespace fluid {
namespace sse42 {

CV_ALWAYS_INLINE void v_gather_pixel_map(v_uint8x16& vec, const uchar src[],
                                         const short* index, const int pos)
{
    const int chanNum = 4;

    // pixel_1 (rgbx)
    vec.val = _mm_insert_epi32(vec.val, *reinterpret_cast<const int*>(&src[chanNum * (*index + pos)]), 0);
    // pixel_2 (rgbx)
    vec.val = _mm_insert_epi32(vec.val, *reinterpret_cast<const int*>(&src[chanNum * (*(index + 1) + pos)]), 1);
    // pixel_3 (rgbx)
    vec.val = _mm_insert_epi32(vec.val, *reinterpret_cast<const int*>(&src[chanNum * (*(index + 2) + pos)]), 2);
    // pixel_4 (rgbx)
    vec.val = _mm_insert_epi32(vec.val, *reinterpret_cast<const int*>(&src[chanNum * (*(index + 3) + pos)]), 3);
}
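
// Editorial sketch: a scalar equivalent of the gather above, for reference
// only (the helper name is hypothetical and not part of the commit).  Each of
// the four 32-bit lanes receives one whole 4-channel (RGBx) pixel fetched at
// a mapped x index, which is why the gather assumes chanNum == 4.
static inline void gather_pixels_scalar(uchar dst[16], const uchar* src,
                                        const short* index, const int pos)
{
    const int chanNum = 4;
    for (int i = 0; i < 4; ++i)
    {
        const uchar* p = &src[chanNum * (index[i] + pos)];
        for (int c = 0; c < chanNum; ++c)
            dst[4 * i + c] = p[c];   // one RGBx pixel per 32-bit lane
    }
}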

CV_ALWAYS_INLINE void resize_vertical_anyLPI(const uchar* src0, const uchar* src1,
                                             uchar* dst, const int inLength,
                                             const short beta)
{
    constexpr int nlanes = 16;
    __m128i zero = _mm_setzero_si128();
    __m128i b = _mm_set1_epi16(beta);

    for (int w = 0; inLength >= nlanes;)
    {
        for (; w <= inLength - nlanes; w += nlanes)
        {
            __m128i s0 = _mm_lddqu_si128(reinterpret_cast<const __m128i*>(&src0[w]));
            __m128i s1 = _mm_lddqu_si128(reinterpret_cast<const __m128i*>(&src1[w]));
            __m128i a1 = _mm_unpacklo_epi8(s0, zero);
            __m128i b1 = _mm_unpacklo_epi8(s1, zero);
            __m128i a2 = _mm_unpackhi_epi8(s0, zero);
            __m128i b2 = _mm_unpackhi_epi8(s1, zero);
            __m128i r1 = _mm_mulhrs_epi16(_mm_sub_epi16(a1, b1), b);
            __m128i r2 = _mm_mulhrs_epi16(_mm_sub_epi16(a2, b2), b);
            __m128i res1 = _mm_add_epi16(r1, b1);
            __m128i res2 = _mm_add_epi16(r2, b2);
            _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + w), _mm_packus_epi16(res1, res2));
        }

        // Tail: rewind to the last full vector; the final (overlapping) store
        // covers the remaining inLength % nlanes elements.
        if (w < inLength) {
            w = inLength - nlanes;
            continue;
        }
        break;
    }
}
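
// Editorial sketch: a scalar model of resize_vertical_anyLPI (hypothetical
// helper, not part of the commit).  _mm_mulhrs_epi16 computes
// (a * b + 0x4000) >> 15, so with beta as the Q0.15 weight of row src0 the
// kernel evaluates dst = s1 + round(beta * (s0 - s1) / 32768), i.e. linear
// interpolation between the two source rows.  An arithmetic right shift on
// negative intermediates is assumed here, as on all mainstream compilers.
static inline void vertical_pass_scalar(const uchar* src0, const uchar* src1,
                                        uchar* dst, const int inLength,
                                        const short beta)
{
    for (int w = 0; w < inLength; ++w)
    {
        const int d = static_cast<int>(src0[w]) - static_cast<int>(src1[w]);
        const int t = (d * beta + (1 << 14)) >> 15;  // scalar mulhrs
        const int v = src1[w] + t;
        dst[w] = static_cast<uchar>(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
}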

CV_ALWAYS_INLINE void resize_horizontal_anyLPI(uint8_t* dst,
                                               const uchar* src, const short mapsx[],
                                               const short alpha[], const int width)
{
    constexpr int nlanes = 16;
    constexpr int chanNum = 3;
    __m128i zero = _mm_setzero_si128();

    for (int x = 0; width >= nlanes;)
    {
        for (; x <= width - nlanes; x += nlanes)
        {
            __m128i a012 = _mm_setr_epi16(alpha[x], alpha[x], alpha[x], alpha[x + 1],
                                          alpha[x + 1], alpha[x + 1], alpha[x + 2], alpha[x + 2]);
            __m128i a2345 = _mm_setr_epi16(alpha[x + 2], alpha[x + 3], alpha[x + 3], alpha[x + 3],
                                           alpha[x + 4], alpha[x + 4], alpha[x + 4], alpha[x + 5]);

            __m128i a567 = _mm_setr_epi16(alpha[x + 5], alpha[x + 5], alpha[x + 6], alpha[x + 6],
                                          alpha[x + 6], alpha[x + 7], alpha[x + 7], alpha[x + 7]);
            __m128i a8910 = _mm_setr_epi16(alpha[x + 8], alpha[x + 8], alpha[x + 8], alpha[x + 9],
                                           alpha[x + 9], alpha[x + 9], alpha[x + 10], alpha[x + 10]);

            __m128i a10111213 = _mm_setr_epi16(alpha[x + 10], alpha[x + 11], alpha[x + 11], alpha[x + 11],
                                               alpha[x + 12], alpha[x + 12], alpha[x + 12], alpha[x + 13]);
            __m128i a131415 = _mm_setr_epi16(alpha[x + 13], alpha[x + 13], alpha[x + 14], alpha[x + 14],
                                             alpha[x + 14], alpha[x + 15], alpha[x + 15], alpha[x + 15]);

            __m128i a1 = _mm_setr_epi8(src[chanNum * (mapsx[x] + 0)], src[chanNum * (mapsx[x] + 0) + 1], src[chanNum * (mapsx[x] + 0) + 2],
                                       src[chanNum * (mapsx[x + 1] + 0)], src[chanNum * (mapsx[x + 1] + 0) + 1], src[chanNum * (mapsx[x + 1] + 0) + 2],
                                       src[chanNum * (mapsx[x + 2] + 0)], src[chanNum * (mapsx[x + 2] + 0) + 1], src[chanNum * (mapsx[x + 2] + 0) + 2],
                                       src[chanNum * (mapsx[x + 3] + 0)], src[chanNum * (mapsx[x + 3] + 0) + 1], src[chanNum * (mapsx[x + 3] + 0) + 2],
                                       src[chanNum * (mapsx[x + 4] + 0)], src[chanNum * (mapsx[x + 4] + 0) + 1], src[chanNum * (mapsx[x + 4] + 0) + 2],
                                       src[chanNum * (mapsx[x + 5] + 0)]);
            __m128i b1 = _mm_setr_epi8(src[chanNum * (mapsx[x] + 1)], src[chanNum * (mapsx[x] + 1) + 1], src[chanNum * (mapsx[x] + 1) + 2],
                                       src[chanNum * (mapsx[x + 1] + 1)], src[chanNum * (mapsx[x + 1] + 1) + 1], src[chanNum * (mapsx[x + 1] + 1) + 2],
                                       src[chanNum * (mapsx[x + 2] + 1)], src[chanNum * (mapsx[x + 2] + 1) + 1], src[chanNum * (mapsx[x + 2] + 1) + 2],
                                       src[chanNum * (mapsx[x + 3] + 1)], src[chanNum * (mapsx[x + 3] + 1) + 1], src[chanNum * (mapsx[x + 3] + 1) + 2],
                                       src[chanNum * (mapsx[x + 4] + 1)], src[chanNum * (mapsx[x + 4] + 1) + 1], src[chanNum * (mapsx[x + 4] + 1) + 2],
                                       src[chanNum * (mapsx[x + 5] + 1)]);

            __m128i a2 = _mm_setr_epi8(src[chanNum * (mapsx[x + 5] + 0) + 1], src[chanNum * (mapsx[x + 5] + 0) + 2], src[chanNum * (mapsx[x + 6] + 0)],
                                       src[chanNum * (mapsx[x + 6] + 0) + 1], src[chanNum * (mapsx[x + 6] + 0) + 2], src[chanNum * (mapsx[x + 7] + 0)],
                                       src[chanNum * (mapsx[x + 7] + 0) + 1], src[chanNum * (mapsx[x + 7] + 0) + 2], src[chanNum * (mapsx[x + 8] + 0)],
                                       src[chanNum * (mapsx[x + 8] + 0) + 1], src[chanNum * (mapsx[x + 8] + 0) + 2], src[chanNum * (mapsx[x + 9] + 0)],
                                       src[chanNum * (mapsx[x + 9] + 0) + 1], src[chanNum * (mapsx[x + 9] + 0) + 2], src[chanNum * (mapsx[x + 10] + 0)],
                                       src[chanNum * (mapsx[x + 10] + 0) + 1]);

            __m128i b2 = _mm_setr_epi8(src[chanNum * (mapsx[x + 5] + 1) + 1], src[chanNum * (mapsx[x + 5] + 1) + 2], src[chanNum * (mapsx[x + 6] + 1)],
                                       src[chanNum * (mapsx[x + 6] + 1) + 1], src[chanNum * (mapsx[x + 6] + 1) + 2], src[chanNum * (mapsx[x + 7] + 1)],
                                       src[chanNum * (mapsx[x + 7] + 1) + 1], src[chanNum * (mapsx[x + 7] + 1) + 2], src[chanNum * (mapsx[x + 8] + 1)],
                                       src[chanNum * (mapsx[x + 8] + 1) + 1], src[chanNum * (mapsx[x + 8] + 1) + 2], src[chanNum * (mapsx[x + 9] + 1)],
                                       src[chanNum * (mapsx[x + 9] + 1) + 1], src[chanNum * (mapsx[x + 9] + 1) + 2], src[chanNum * (mapsx[x + 10] + 1)],
                                       src[chanNum * (mapsx[x + 10] + 1) + 1]);

            __m128i a3 = _mm_setr_epi8(src[chanNum * (mapsx[x + 10] + 0) + 2], src[chanNum * (mapsx[x + 11] + 0)], src[chanNum * (mapsx[x + 11] + 0) + 1],
                                       src[chanNum * (mapsx[x + 11] + 0) + 2], src[chanNum * (mapsx[x + 12] + 0)], src[chanNum * (mapsx[x + 12] + 0) + 1],
                                       src[chanNum * (mapsx[x + 12] + 0) + 2], src[chanNum * (mapsx[x + 13] + 0)], src[chanNum * (mapsx[x + 13] + 0) + 1],
                                       src[chanNum * (mapsx[x + 13] + 0) + 2], src[chanNum * (mapsx[x + 14] + 0)], src[chanNum * (mapsx[x + 14] + 0) + 1],
                                       src[chanNum * (mapsx[x + 14] + 0) + 2], src[chanNum * (mapsx[x + 15] + 0)], src[chanNum * (mapsx[x + 15] + 0) + 1],
                                       src[chanNum * (mapsx[x + 15] + 0) + 2]);

            __m128i b3 = _mm_setr_epi8(src[chanNum * (mapsx[x + 10] + 1) + 2], src[chanNum * (mapsx[x + 11] + 1)], src[chanNum * (mapsx[x + 11] + 1) + 1],
                                       src[chanNum * (mapsx[x + 11] + 1) + 2], src[chanNum * (mapsx[x + 12] + 1)], src[chanNum * (mapsx[x + 12] + 1) + 1],
                                       src[chanNum * (mapsx[x + 12] + 1) + 2], src[chanNum * (mapsx[x + 13] + 1)], src[chanNum * (mapsx[x + 13] + 1) + 1],
                                       src[chanNum * (mapsx[x + 13] + 1) + 2], src[chanNum * (mapsx[x + 14] + 1)], src[chanNum * (mapsx[x + 14] + 1) + 1],
                                       src[chanNum * (mapsx[x + 14] + 1) + 2], src[chanNum * (mapsx[x + 15] + 1)], src[chanNum * (mapsx[x + 15] + 1) + 1],
                                       src[chanNum * (mapsx[x + 15] + 1) + 2]);

            __m128i a11 = _mm_unpacklo_epi8(a1, zero);
            __m128i a12 = _mm_unpackhi_epi8(a1, zero);
            __m128i a21 = _mm_unpacklo_epi8(a2, zero);
            __m128i a22 = _mm_unpackhi_epi8(a2, zero);
            __m128i a31 = _mm_unpacklo_epi8(a3, zero);
            __m128i a32 = _mm_unpackhi_epi8(a3, zero);
            __m128i b11 = _mm_unpacklo_epi8(b1, zero);
            __m128i b12 = _mm_unpackhi_epi8(b1, zero);
            __m128i b21 = _mm_unpacklo_epi8(b2, zero);
            __m128i b22 = _mm_unpackhi_epi8(b2, zero);
            __m128i b31 = _mm_unpacklo_epi8(b3, zero);
            __m128i b32 = _mm_unpackhi_epi8(b3, zero);

            __m128i r1 = _mm_mulhrs_epi16(_mm_sub_epi16(a11, b11), a012);
            __m128i r2 = _mm_mulhrs_epi16(_mm_sub_epi16(a12, b12), a2345);
            __m128i r3 = _mm_mulhrs_epi16(_mm_sub_epi16(a21, b21), a567);
            __m128i r4 = _mm_mulhrs_epi16(_mm_sub_epi16(a22, b22), a8910);
            __m128i r5 = _mm_mulhrs_epi16(_mm_sub_epi16(a31, b31), a10111213);
            __m128i r6 = _mm_mulhrs_epi16(_mm_sub_epi16(a32, b32), a131415);

            __m128i r_1 = _mm_add_epi16(b11, r1);
            __m128i r_2 = _mm_add_epi16(b12, r2);
            __m128i r_3 = _mm_add_epi16(b21, r3);
            __m128i r_4 = _mm_add_epi16(b22, r4);
            __m128i r_5 = _mm_add_epi16(b31, r5);
            __m128i r_6 = _mm_add_epi16(b32, r6);

            __m128i res1 = _mm_packus_epi16(r_1, r_2);
            __m128i res2 = _mm_packus_epi16(r_3, r_4);
            __m128i res3 = _mm_packus_epi16(r_5, r_6);
            _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[chanNum * x]), res1);
            _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[chanNum * x + 16]), res2);
            _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[chanNum * x + 32]), res3);
        }

        // Tail: rewind to the last full vector; the final iteration overlaps
        // already computed pixels to cover width % nlanes.
        if (x < width) {
            x = width - nlanes;
            continue;
        }
        break;
    }
}
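
// Editorial sketch: a scalar model of resize_horizontal_anyLPI for 8UC3
// (hypothetical helper, not part of the commit).  mapsx[x] is the left source
// pixel index and alpha[x] its Q0.15 weight, so each channel evaluates
// dst = right + mulhrs(left - right, alpha) = alpha*left + (1-alpha)*right.
static inline void horizontal_pass_scalar(uchar* dst, const uchar* src,
                                          const short mapsx[], const short alpha[],
                                          const int width)
{
    constexpr int chanNum = 3;
    for (int x = 0; x < width; ++x)
    {
        const int sx = mapsx[x];
        for (int c = 0; c < chanNum; ++c)
        {
            const int left  = src[chanNum * sx + c];
            const int right = src[chanNum * (sx + 1) + c];
            const int t = ((left - right) * alpha[x] + (1 << 14)) >> 15;  // scalar mulhrs
            dst[chanNum * x + c] = static_cast<uchar>(right + t);
        }
    }
}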

template<int chanNum>
CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl_(uint8_t**,
                                              const uint8_t**,
                                              const uint8_t**,
                                              const short*,
                                              const short*,
                                              const short*,
                                              const short*,
                                              uint8_t*,
                                              const Size&,
                                              const Size&,
                                              const int)
{
    static_assert(chanNum != 3, "Unsupported number of channels");
}
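
// Editorial note: the primary template above is a deliberately empty stub;
// only channel counts with an explicit specialization (the 3-channel one
// below in this excerpt) are meant to be instantiated, and the static_assert
// documents that the 3-channel case must never fall through to the stub.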

template<>
CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl_<3>(uint8_t* dst[],
                                                 const uint8_t* src0[],
                                                 const uint8_t* src1[],
                                                 const short alpha[],
                                                 const short* clone,  // 4 clones of alpha
                                                 const short mapsx[],
                                                 const short beta[],
                                                 uint8_t tmp[],
                                                 const Size& inSz,
                                                 const Size& outSz,
                                                 const int lpi)
{
    bool xRatioEq = inSz.width  == outSz.width;
    bool yRatioEq = inSz.height == outSz.height;
    constexpr int nlanes = 16;
    constexpr int half_nlanes = nlanes / 2;
    constexpr int chanNum = 3;

    if (!xRatioEq && !yRatioEq) {
        int inLength = inSz.width * chanNum;

        if (lpi == 4)
        {
            // vertical pass
            __m128i b0 = _mm_set1_epi16(beta[0]);
            __m128i b1 = _mm_set1_epi16(beta[1]);
            __m128i b2 = _mm_set1_epi16(beta[2]);
            __m128i b3 = _mm_set1_epi16(beta[3]);
            __m128i zero = _mm_setzero_si128();
            __m128i vertical_shuf_mask = _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15);

            for (int w = 0; w < inSz.width * chanNum; ) {
                for (; w <= inSz.width * chanNum - half_nlanes && w >= 0; w += half_nlanes) {
#ifdef __i386__
                    __m128i val0lo = _mm_castpd_si128(_mm_loadh_pd(
                                         _mm_load_sd(reinterpret_cast<const double*>(&src0[0][w])),
                                                     reinterpret_cast<const double*>(&src0[1][w])));
                    __m128i val0hi = _mm_castpd_si128(_mm_loadh_pd(
                                         _mm_load_sd(reinterpret_cast<const double*>(&src0[2][w])),
                                                     reinterpret_cast<const double*>(&src0[3][w])));
                    __m128i val1lo = _mm_castpd_si128(_mm_loadh_pd(
                                         _mm_load_sd(reinterpret_cast<const double*>(&src1[0][w])),
                                                     reinterpret_cast<const double*>(&src1[1][w])));
                    __m128i val1hi = _mm_castpd_si128(_mm_loadh_pd(
                                         _mm_load_sd(reinterpret_cast<const double*>(&src1[2][w])),
                                                     reinterpret_cast<const double*>(&src1[3][w])));
#else
                    __m128i val0lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src0[0][w])),
                                                      *reinterpret_cast<const int64_t*>(&src0[1][w]), 1);
                    __m128i val0hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src0[2][w])),
                                                      *reinterpret_cast<const int64_t*>(&src0[3][w]), 1);
                    __m128i val1lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src1[0][w])),
                                                      *reinterpret_cast<const int64_t*>(&src1[1][w]), 1);
                    __m128i val1hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src1[2][w])),
                                                      *reinterpret_cast<const int64_t*>(&src1[3][w]), 1);
#endif
                    __m128i val0_0 = _mm_cvtepu8_epi16(val0lo);
                    __m128i val0_2 = _mm_cvtepu8_epi16(val0hi);
                    __m128i val1_0 = _mm_cvtepu8_epi16(val1lo);
                    __m128i val1_2 = _mm_cvtepu8_epi16(val1hi);

                    __m128i val0_1 = _mm_unpackhi_epi8(val0lo, zero);
                    __m128i val0_3 = _mm_unpackhi_epi8(val0hi, zero);
                    __m128i val1_1 = _mm_unpackhi_epi8(val1lo, zero);
                    __m128i val1_3 = _mm_unpackhi_epi8(val1hi, zero);

                    __m128i t0 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_0, val1_0), b0);
                    __m128i t1 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_1, val1_1), b1);
                    __m128i t2 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_2, val1_2), b2);
                    __m128i t3 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_3, val1_3), b3);

                    __m128i r0 = _mm_add_epi16(val1_0, t0);
                    __m128i r1 = _mm_add_epi16(val1_1, t1);
                    __m128i r2 = _mm_add_epi16(val1_2, t2);
                    __m128i r3 = _mm_add_epi16(val1_3, t3);

                    __m128i q0 = _mm_packus_epi16(r0, r1);
                    __m128i q1 = _mm_packus_epi16(r2, r3);

                    __m128i q2 = _mm_blend_epi16(q0, _mm_slli_si128(q1, 4), 0xCC /*0b11001100*/);
                    __m128i q3 = _mm_blend_epi16(_mm_srli_si128(q0, 4), q1, 0xCC /*0b11001100*/);

                    __m128i q4 = _mm_shuffle_epi8(q2, vertical_shuf_mask);
                    __m128i q5 = _mm_shuffle_epi8(q3, vertical_shuf_mask);

                    _mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[4 * w +  0]), q4);
                    _mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[4 * w + 16]), q5);
                }

                // Tail: rewind to the last full half-vector; the final
                // (overlapping) iteration covers the remaining elements.
                if (w < inSz.width * chanNum) {
                    w = inSz.width * chanNum - half_nlanes;
                }
            }
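
            // Editorial note (a reading of the shuffles above, not text from
            // the original commit): the blend+shuffle pair transposes the
            // packed results so that tmp holds the four interpolated rows
            // interleaved byte-wise, i.e. tmp[4 * i + l] is byte i of row l.
            // The horizontal pass below can then fetch one mapped source
            // pixel for all four rows with a single 16-byte load.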

            // horizontal pass
            __m128i horizontal_shuf_mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);

            for (int x = 0; outSz.width >= nlanes; )
            {
                for (; x <= outSz.width - nlanes; x += nlanes)
                {
#ifdef _WIN64
                    __m128i a00 = _mm_setr_epi64x(*reinterpret_cast<const int64_t*>(&clone[4 * x]), *reinterpret_cast<const int64_t*>(&clone[4 * x]));
                    __m128i a01 = _mm_setr_epi64x(*reinterpret_cast<const int64_t*>(&clone[4 * x]), *reinterpret_cast<const int64_t*>(&clone[4 * (x + 1)]));
                    __m128i a11 = _mm_setr_epi64x(*reinterpret_cast<const int64_t*>(&clone[4 * (x + 1)]), *reinterpret_cast<const int64_t*>(&clone[4 * (x + 1)]));
                    __m128i a22 = _mm_setr_epi64x(*reinterpret_cast<const int64_t*>(&clone[4 * (x + 2)]), *reinterpret_cast<const int64_t*>(&clone[4 * (x + 2)]));
                    __m128i a23 = _mm_setr_epi64x(*reinterpret_cast<const int64_t*>(&clone[4 * (x + 2)]), *reinterpret_cast<const int64_t*>(&clone[4 * (x + 3)]));
                    __m128i a33 = _mm_setr_epi64x(*reinterpret_cast<const int64_t*>(&clone[4 * (x + 3)]), *reinterpret_cast<const int64_t*>(&clone[4 * (x + 3)]));
                    __m128i a44 = _mm_setr_epi64x(*reinterpret_cast<const int64_t*>(&clone[4 * (x + 4)]), *reinterpret_cast<const int64_t*>(&clone[4 * (x + 4)]));
                    __m128i a45 = _mm_setr_epi64x(*reinterpret_cast<const int64_t*>(&clone[4 * (x + 4)]), *reinterpret_cast<const int64_t*>(&clone[4 * (x + 5)]));
                    __m128i a55 = _mm_setr_epi64x(*reinterpret_cast<const int64_t*>(&clone[4 * (x + 5)]), *reinterpret_cast<const int64_t*>(&clone[4 * (x + 5)]));
                    __m128i a66 = _mm_setr_epi64x(*reinterpret_cast<const int64_t*>(&clone[4 * (x + 6)]), *reinterpret_cast<const int64_t*>(&clone[4 * (x + 6)]));
                    __m128i a67 = _mm_setr_epi64x(*reinterpret_cast<const int64_t*>(&clone[4 * (x + 6)]), *reinterpret_cast<const int64_t*>(&clone[4 * (x + 7)]));
                    __m128i a77 = _mm_setr_epi64x(*reinterpret_cast<const int64_t*>(&clone[4 * (x + 7)]), *reinterpret_cast<const int64_t*>(&clone[4 * (x + 7)]));
                    __m128i a88 = _mm_setr_epi64x(*reinterpret_cast<const int64_t*>(&clone[4 * (x + 8)]), *reinterpret_cast<const int64_t*>(&clone[4 * (x + 8)]));
                    __m128i a89 = _mm_setr_epi64x(*reinterpret_cast<const int64_t*>(&clone[4 * (x + 8)]), *reinterpret_cast<const int64_t*>(&clone[4 * (x + 9)]));
                    __m128i a99 = _mm_setr_epi64x(*reinterpret_cast<const int64_t*>(&clone[4 * (x + 9)]), *reinterpret_cast<const int64_t*>(&clone[4 * (x + 9)]));
                    __m128i a1010 = _mm_setr_epi64x(*reinterpret_cast<const int64_t*>(&clone[4 * (x + 10)]), *reinterpret_cast<const int64_t*>(&clone[4 * (x + 10)]));
                    __m128i a1011 = _mm_setr_epi64x(*reinterpret_cast<const int64_t*>(&clone[4 * (x + 10)]), *reinterpret_cast<const int64_t*>(&clone[4 * (x + 11)]));
                    __m128i a1111 = _mm_setr_epi64x(*reinterpret_cast<const int64_t*>(&clone[4 * (x + 11)]), *reinterpret_cast<const int64_t*>(&clone[4 * (x + 11)]));
                    __m128i a1212 = _mm_setr_epi64x(*reinterpret_cast<const int64_t*>(&clone[4 * (x + 12)]), *reinterpret_cast<const int64_t*>(&clone[4 * (x + 12)]));
                    __m128i a1213 = _mm_setr_epi64x(*reinterpret_cast<const int64_t*>(&clone[4 * (x + 12)]), *reinterpret_cast<const int64_t*>(&clone[4 * (x + 13)]));
                    __m128i a1313 = _mm_setr_epi64x(*reinterpret_cast<const int64_t*>(&clone[4 * (x + 13)]), *reinterpret_cast<const int64_t*>(&clone[4 * (x + 13)]));
                    __m128i a1414 = _mm_setr_epi64x(*reinterpret_cast<const int64_t*>(&clone[4 * (x + 14)]), *reinterpret_cast<const int64_t*>(&clone[4 * (x + 14)]));
                    __m128i a1415 = _mm_setr_epi64x(*reinterpret_cast<const int64_t*>(&clone[4 * (x + 14)]), *reinterpret_cast<const int64_t*>(&clone[4 * (x + 15)]));
                    __m128i a1515 = _mm_setr_epi64x(*reinterpret_cast<const int64_t*>(&clone[4 * (x + 15)]), *reinterpret_cast<const int64_t*>(&clone[4 * (x + 15)]));
#else
                    __m128i a00 = _mm_setr_epi64(*reinterpret_cast<const __m64*>(&clone[4 * x]), *reinterpret_cast<const __m64*>(&clone[4 * x]));
                    __m128i a01 = _mm_setr_epi64(*reinterpret_cast<const __m64*>(&clone[4 * x]), *reinterpret_cast<const __m64*>(&clone[4 * (x + 1)]));
                    __m128i a11 = _mm_setr_epi64(*reinterpret_cast<const __m64*>(&clone[4 * (x + 1)]), *reinterpret_cast<const __m64*>(&clone[4 * (x + 1)]));
                    __m128i a22 = _mm_setr_epi64(*reinterpret_cast<const __m64*>(&clone[4 * (x + 2)]), *reinterpret_cast<const __m64*>(&clone[4 * (x + 2)]));
                    __m128i a23 = _mm_setr_epi64(*reinterpret_cast<const __m64*>(&clone[4 * (x + 2)]), *reinterpret_cast<const __m64*>(&clone[4 * (x + 3)]));
                    __m128i a33 = _mm_setr_epi64(*reinterpret_cast<const __m64*>(&clone[4 * (x + 3)]), *reinterpret_cast<const __m64*>(&clone[4 * (x + 3)]));
                    __m128i a44 = _mm_setr_epi64(*reinterpret_cast<const __m64*>(&clone[4 * (x + 4)]), *reinterpret_cast<const __m64*>(&clone[4 * (x + 4)]));
                    __m128i a45 = _mm_setr_epi64(*reinterpret_cast<const __m64*>(&clone[4 * (x + 4)]), *reinterpret_cast<const __m64*>(&clone[4 * (x + 5)]));
                    __m128i a55 = _mm_setr_epi64(*reinterpret_cast<const __m64*>(&clone[4 * (x + 5)]), *reinterpret_cast<const __m64*>(&clone[4 * (x + 5)]));
                    __m128i a66 = _mm_setr_epi64(*reinterpret_cast<const __m64*>(&clone[4 * (x + 6)]), *reinterpret_cast<const __m64*>(&clone[4 * (x + 6)]));
                    __m128i a67 = _mm_setr_epi64(*reinterpret_cast<const __m64*>(&clone[4 * (x + 6)]), *reinterpret_cast<const __m64*>(&clone[4 * (x + 7)]));
                    __m128i a77 = _mm_setr_epi64(*reinterpret_cast<const __m64*>(&clone[4 * (x + 7)]), *reinterpret_cast<const __m64*>(&clone[4 * (x + 7)]));
                    __m128i a88 = _mm_setr_epi64(*reinterpret_cast<const __m64*>(&clone[4 * (x + 8)]), *reinterpret_cast<const __m64*>(&clone[4 * (x + 8)]));
                    __m128i a89 = _mm_setr_epi64(*reinterpret_cast<const __m64*>(&clone[4 * (x + 8)]), *reinterpret_cast<const __m64*>(&clone[4 * (x + 9)]));
                    __m128i a99 = _mm_setr_epi64(*reinterpret_cast<const __m64*>(&clone[4 * (x + 9)]), *reinterpret_cast<const __m64*>(&clone[4 * (x + 9)]));
                    __m128i a1010 = _mm_setr_epi64(*reinterpret_cast<const __m64*>(&clone[4 * (x + 10)]), *reinterpret_cast<const __m64*>(&clone[4 * (x + 10)]));
                    __m128i a1011 = _mm_setr_epi64(*reinterpret_cast<const __m64*>(&clone[4 * (x + 10)]), *reinterpret_cast<const __m64*>(&clone[4 * (x + 11)]));
                    __m128i a1111 = _mm_setr_epi64(*reinterpret_cast<const __m64*>(&clone[4 * (x + 11)]), *reinterpret_cast<const __m64*>(&clone[4 * (x + 11)]));
                    __m128i a1212 = _mm_setr_epi64(*reinterpret_cast<const __m64*>(&clone[4 * (x + 12)]), *reinterpret_cast<const __m64*>(&clone[4 * (x + 12)]));
                    __m128i a1213 = _mm_setr_epi64(*reinterpret_cast<const __m64*>(&clone[4 * (x + 12)]), *reinterpret_cast<const __m64*>(&clone[4 * (x + 13)]));
                    __m128i a1313 = _mm_setr_epi64(*reinterpret_cast<const __m64*>(&clone[4 * (x + 13)]), *reinterpret_cast<const __m64*>(&clone[4 * (x + 13)]));
                    __m128i a1414 = _mm_setr_epi64(*reinterpret_cast<const __m64*>(&clone[4 * (x + 14)]), *reinterpret_cast<const __m64*>(&clone[4 * (x + 14)]));
                    __m128i a1415 = _mm_setr_epi64(*reinterpret_cast<const __m64*>(&clone[4 * (x + 14)]), *reinterpret_cast<const __m64*>(&clone[4 * (x + 15)]));
                    __m128i a1515 = _mm_setr_epi64(*reinterpret_cast<const __m64*>(&clone[4 * (x + 15)]), *reinterpret_cast<const __m64*>(&clone[4 * (x + 15)]));
#endif

                    // load 3 channels of first pixel from first pair of 4-couple scope
                    __m128i pix1 = _mm_lddqu_si128(reinterpret_cast<const __m128i*>(&tmp[4 * (chanNum * mapsx[x])]));
                    // insert first channel from next couple of pixels to completely fill the simd vector
                    pix1 = _mm_insert_epi32(pix1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 1])]), 3);

                    // load 3 channels of neighbor pixel from first pair of 4-couple scope
                    __m128i pix2 = _mm_lddqu_si128(reinterpret_cast<const __m128i*>(&tmp[4 * (chanNum * (mapsx[x] + 1))]));
                    // insert first channel from next couple of pixels to completely fill the simd vector
                    pix2 = _mm_insert_epi32(pix2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 1] + 1))]), 3);

                    // expand 8-bit data to 16-bit
                    __m128i val_0 = _mm_unpacklo_epi8(pix1, zero);
                    __m128i val_1 = _mm_unpacklo_epi8(pix2, zero);

                    // expand 8-bit data to 16-bit
                    __m128i val_2 = _mm_unpackhi_epi8(pix1, zero);
                    __m128i val_3 = _mm_unpackhi_epi8(pix2, zero);

                    // the main calculations
                    __m128i t0_0 = _mm_mulhrs_epi16(_mm_sub_epi16(val_0, val_1), a00);
                    __m128i t1_0 = _mm_mulhrs_epi16(_mm_sub_epi16(val_2, val_3), a01);
                    __m128i r0_0 = _mm_add_epi16(val_1, t0_0);
                    __m128i r1_0 = _mm_add_epi16(val_3, t1_0);

                    // pack 16-bit data to 8-bit
                    __m128i q0_0 = _mm_packus_epi16(r0_0, r1_0);
                    // gather data from the same lines together
                    __m128i res1 = _mm_shuffle_epi8(q0_0, horizontal_shuf_mask);

                    val_0 = _mm_unpacklo_epi8(_mm_insert_epi64(val_0, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * mapsx[x + 1] + 1)]), 0), zero);
                    val_1 = _mm_unpacklo_epi8(_mm_insert_epi64(val_1, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * (mapsx[x + 1] + 1) + 1)]), 0), zero);

                    val_2 = _mm_insert_epi64(val_2, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * mapsx[x + 2])]), 0);
                    val_3 = _mm_insert_epi64(val_3, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * (mapsx[x + 2] + 1))]), 0);

                    val_2 = _mm_unpacklo_epi8(val_2, zero);
                    val_3 = _mm_unpacklo_epi8(val_3, zero);

                    __m128i t0_1 = _mm_mulhrs_epi16(_mm_sub_epi16(val_0, val_1), a11);
                    __m128i t1_1 = _mm_mulhrs_epi16(_mm_sub_epi16(val_2, val_3), a22);
                    __m128i r0_1 = _mm_add_epi16(val_1, t0_1);
                    __m128i r1_1 = _mm_add_epi16(val_3, t1_1);

                    __m128i q0_1 = _mm_packus_epi16(r0_1, r1_1);
                    __m128i res2 = _mm_shuffle_epi8(q0_1, horizontal_shuf_mask);

                    __m128i pix7 = _mm_lddqu_si128(reinterpret_cast<const __m128i*>(&tmp[4 * (chanNum * (mapsx[x + 3] - 1) + 2)]));
                    pix7 = _mm_insert_epi32(pix7, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 2] + 2)]), 0);

                    __m128i pix8 = _mm_lddqu_si128(reinterpret_cast<const __m128i*>(&tmp[4 * (chanNum * mapsx[x + 3] + 2)]));
                    pix8 = _mm_insert_epi32(pix8, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 2] + 1) + 2)]), 0);

                    val_0 = _mm_unpacklo_epi8(pix7, zero);
                    val_1 = _mm_unpacklo_epi8(pix8, zero);

                    val_2 = _mm_unpackhi_epi8(pix7, zero);
                    val_3 = _mm_unpackhi_epi8(pix8, zero);

                    // the main calculations
                    __m128i t0_2 = _mm_mulhrs_epi16(_mm_sub_epi16(val_0, val_1), a23);
                    __m128i t1_2 = _mm_mulhrs_epi16(_mm_sub_epi16(val_2, val_3), a33);
                    __m128i r0_2 = _mm_add_epi16(val_1, t0_2);
                    __m128i r1_2 = _mm_add_epi16(val_3, t1_2);

                    // pack 16-bit data to 8-bit
                    __m128i q0_2 = _mm_packus_epi16(r0_2, r1_2);
                    __m128i res3 = _mm_shuffle_epi8(q0_2, horizontal_shuf_mask);

                    __m128i pix9 = _mm_lddqu_si128(reinterpret_cast<const __m128i*>(&tmp[4 * (chanNum * mapsx[x + 4])]));
                    // insert first channel from next couple of pixels to completely fill the simd vector
                    pix9 = _mm_insert_epi32(pix9, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 5])]), 3);

                    // load 3 channels of neighbor pixel from first pair of 4-couple scope
                    __m128i pix10 = _mm_lddqu_si128(reinterpret_cast<const __m128i*>(&tmp[4 * (chanNum * (mapsx[x + 4] + 1))]));
                    // insert first channel from next couple of pixels to completely fill the simd vector
                    pix10 = _mm_insert_epi32(pix10, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 5] + 1))]), 3);

                    // expand 8-bit data to 16-bit
                    val_0 = _mm_unpacklo_epi8(pix9, zero);
                    val_1 = _mm_unpacklo_epi8(pix10, zero);

                    // expand 8-bit data to 16-bit
                    val_2 = _mm_unpackhi_epi8(pix9, zero);
                    val_3 = _mm_unpackhi_epi8(pix10, zero);

                    // the main calculations
                    __m128i t0_3 = _mm_mulhrs_epi16(_mm_sub_epi16(val_0, val_1), a44);
                    __m128i t1_3 = _mm_mulhrs_epi16(_mm_sub_epi16(val_2, val_3), a45);
                    __m128i r0_3 = _mm_add_epi16(val_1, t0_3);
                    __m128i r1_3 = _mm_add_epi16(val_3, t1_3);

                    // pack 16-bit data to 8-bit
                    __m128i q0_3 = _mm_packus_epi16(r0_3, r1_3);
                    // gather data from the same lines together
                    __m128i res4 = _mm_shuffle_epi8(q0_3, horizontal_shuf_mask);

                    val_0 = _mm_unpacklo_epi8(_mm_insert_epi64(val_0, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * mapsx[x + 5] + 1)]), 0), zero);
                    val_1 = _mm_unpacklo_epi8(_mm_insert_epi64(val_1, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * (mapsx[x + 5] + 1) + 1)]), 0), zero);

                    val_2 = _mm_insert_epi64(val_2, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * mapsx[x + 6])]), 0);
                    val_3 = _mm_insert_epi64(val_3, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * (mapsx[x + 6] + 1))]), 0);

                    val_2 = _mm_unpacklo_epi8(val_2, zero);
                    val_3 = _mm_unpacklo_epi8(val_3, zero);

                    __m128i t0_4 = _mm_mulhrs_epi16(_mm_sub_epi16(val_0, val_1), a55);
                    __m128i t1_4 = _mm_mulhrs_epi16(_mm_sub_epi16(val_2, val_3), a66);
                    __m128i r0_4 = _mm_add_epi16(val_1, t0_4);
                    __m128i r1_4 = _mm_add_epi16(val_3, t1_4);

                    __m128i q0_4 = _mm_packus_epi16(r0_4, r1_4);
                    __m128i res5 = _mm_shuffle_epi8(q0_4, horizontal_shuf_mask);

                    __m128i pix15 = _mm_lddqu_si128(reinterpret_cast<const __m128i*>(&tmp[4 * (chanNum * (mapsx[x + 7] - 1) + 2)]));
                    pix15 = _mm_insert_epi32(pix15, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 6] + 2)]), 0);

                    __m128i pix16 = _mm_lddqu_si128(reinterpret_cast<const __m128i*>(&tmp[4 * (chanNum * mapsx[x + 7] + 2)]));
                    pix16 = _mm_insert_epi32(pix16, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 6] + 1) + 2)]), 0);

                    val_0 = _mm_unpacklo_epi8(pix15, zero);
                    val_1 = _mm_unpacklo_epi8(pix16, zero);

                    val_2 = _mm_unpackhi_epi8(pix15, zero);
                    val_3 = _mm_unpackhi_epi8(pix16, zero);

                    // the main calculations
                    __m128i t0_5 = _mm_mulhrs_epi16(_mm_sub_epi16(val_0, val_1), a67);
                    __m128i t1_5 = _mm_mulhrs_epi16(_mm_sub_epi16(val_2, val_3), a77);
                    __m128i r0_5 = _mm_add_epi16(val_1, t0_5);
                    __m128i r1_5 = _mm_add_epi16(val_3, t1_5);

                    // pack 16-bit data to 8-bit
                    __m128i q0_5 = _mm_packus_epi16(r0_5, r1_5);
                    __m128i res6 = _mm_shuffle_epi8(q0_5, horizontal_shuf_mask);

                    __m128i bl1 = _mm_blend_epi16(res1, _mm_slli_si128(res2, 4), 0xCC /*0b11001100*/);
                    __m128i bl2 = _mm_blend_epi16(_mm_srli_si128(res1, 4), res2, 0xCC /*0b11001100*/);

                    __m128i bl3 = _mm_blend_epi16(res3, _mm_slli_si128(res4, 4), 0xCC /*0b11001100*/);
                    __m128i bl4 = _mm_blend_epi16(_mm_srli_si128(res3, 4), res4, 0xCC /*0b11001100*/);

                    __m128i bl5 = _mm_blend_epi16(res5, _mm_slli_si128(res6, 4), 0xCC /*0b11001100*/);
                    __m128i bl6 = _mm_blend_epi16(_mm_srli_si128(res5, 4), res6, 0xCC /*0b11001100*/);

                    __m128i bl13 = _mm_blend_epi16(bl1, _mm_slli_si128(bl3, 8), 0xF0 /*0b11110000*/);
                    __m128i bl31 = _mm_blend_epi16(_mm_srli_si128(bl1, 8), bl3, 0xF0 /*0b11110000*/);

                    __m128i bl24 = _mm_blend_epi16(bl2, _mm_slli_si128(bl4, 8), 0xF0 /*0b11110000*/);
                    __m128i bl42 = _mm_blend_epi16(_mm_srli_si128(bl2, 8), bl4, 0xF0 /*0b11110000*/);

                    // load 3 channels of first pixel from first pair of 4-couple scope
                    __m128i pix17 = _mm_lddqu_si128(reinterpret_cast<const __m128i*>(&tmp[4 * (chanNum * mapsx[x + 8])]));
                    // insert first channel from next couple of pixels to completely fill the simd vector
                    pix17 = _mm_insert_epi32(pix17, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 9])]), 3);

                    // load 3 channels of neighbor pixel from first pair of 4-couple scope
                    __m128i pix18 = _mm_lddqu_si128(reinterpret_cast<const __m128i*>(&tmp[4 * (chanNum * (mapsx[x + 8] + 1))]));
                    // insert first channel from next couple of pixels to completely fill the simd vector
                    pix18 = _mm_insert_epi32(pix18, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 9] + 1))]), 3);

                    // expand 8-bit data to 16-bit
                    val_0 = _mm_unpacklo_epi8(pix17, zero);
                    val_1 = _mm_unpacklo_epi8(pix18, zero);

                    // expand 8-bit data to 16-bit
                    val_2 = _mm_unpackhi_epi8(pix17, zero);
                    val_3 = _mm_unpackhi_epi8(pix18, zero);

                    // the main calculations
                    __m128i t0_6 = _mm_mulhrs_epi16(_mm_sub_epi16(val_0, val_1), a88);
                    __m128i t1_6 = _mm_mulhrs_epi16(_mm_sub_epi16(val_2, val_3), a89);
                    __m128i r0_6 = _mm_add_epi16(val_1, t0_6);
                    __m128i r1_6 = _mm_add_epi16(val_3, t1_6);

                    // pack 16-bit data to 8-bit
                    __m128i q0_6 = _mm_packus_epi16(r0_6, r1_6);
                    // gather data from the same lines together
                    __m128i res7 = _mm_shuffle_epi8(q0_6, horizontal_shuf_mask);

                    val_0 = _mm_unpacklo_epi8(_mm_insert_epi64(val_0, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * mapsx[x + 9] + 1)]), 0), zero);
                    val_1 = _mm_unpacklo_epi8(_mm_insert_epi64(val_1, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * (mapsx[x + 9] + 1) + 1)]), 0), zero);

                    val_2 = _mm_insert_epi64(val_2, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * mapsx[x + 10])]), 0);
                    val_3 = _mm_insert_epi64(val_3, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * (mapsx[x + 10] + 1))]), 0);

                    val_2 = _mm_unpacklo_epi8(val_2, zero);
                    val_3 = _mm_unpacklo_epi8(val_3, zero);

                    __m128i t0_7 = _mm_mulhrs_epi16(_mm_sub_epi16(val_0, val_1), a99);
                    __m128i t1_7 = _mm_mulhrs_epi16(_mm_sub_epi16(val_2, val_3), a1010);
                    __m128i r0_7 = _mm_add_epi16(val_1, t0_7);
                    __m128i r1_7 = _mm_add_epi16(val_3, t1_7);

                    __m128i q0_7 = _mm_packus_epi16(r0_7, r1_7);
                    __m128i res8 = _mm_shuffle_epi8(q0_7, horizontal_shuf_mask);

                    __m128i pix21 = _mm_lddqu_si128(reinterpret_cast<const __m128i*>(&tmp[4 * (chanNum * (mapsx[x + 11] - 1) + 2)]));
                    pix21 = _mm_insert_epi32(pix21, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 10] + 2)]), 0);

                    __m128i pix22 = _mm_lddqu_si128(reinterpret_cast<const __m128i*>(&tmp[4 * (chanNum * mapsx[x + 11] + 2)]));
                    pix22 = _mm_insert_epi32(pix22, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 10] + 1) + 2)]), 0);

                    val_0 = _mm_unpacklo_epi8(pix21, zero);
                    val_1 = _mm_unpacklo_epi8(pix22, zero);

                    val_2 = _mm_unpackhi_epi8(pix21, zero);
                    val_3 = _mm_unpackhi_epi8(pix22, zero);

                    // the main calculations
                    __m128i t0_8 = _mm_mulhrs_epi16(_mm_sub_epi16(val_0, val_1), a1011);
                    __m128i t1_8 = _mm_mulhrs_epi16(_mm_sub_epi16(val_2, val_3), a1111);
                    __m128i r0_8 = _mm_add_epi16(val_1, t0_8);
                    __m128i r1_8 = _mm_add_epi16(val_3, t1_8);

                    // pack 16-bit data to 8-bit
                    __m128i q0_8 = _mm_packus_epi16(r0_8, r1_8);
                    __m128i res9 = _mm_shuffle_epi8(q0_8, horizontal_shuf_mask);

                    __m128i pix23 = _mm_lddqu_si128(reinterpret_cast<const __m128i*>(&tmp[4 * (chanNum * mapsx[x + 12])]));
                    // insert first channel from next couple of pixels to completely fill the simd vector
                    pix23 = _mm_insert_epi32(pix23, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 13])]), 3);

                    // load 3 channels of neighbor pixel from first pair of 4-couple scope
                    __m128i pix24 = _mm_lddqu_si128(reinterpret_cast<const __m128i*>(&tmp[4 * (chanNum * (mapsx[x + 12] + 1))]));
                    // insert first channel from next couple of pixels to completely fill the simd vector
                    pix24 = _mm_insert_epi32(pix24, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 13] + 1))]), 3);

                    // expand 8-bit data to 16-bit
                    val_0 = _mm_unpacklo_epi8(pix23, zero);
                    val_1 = _mm_unpacklo_epi8(pix24, zero);

                    // expand 8-bit data to 16-bit
                    val_2 = _mm_unpackhi_epi8(pix23, zero);
                    val_3 = _mm_unpackhi_epi8(pix24, zero);

                    // the main calculations
                    __m128i t0_9 = _mm_mulhrs_epi16(_mm_sub_epi16(val_0, val_1), a1212);
                    __m128i t1_9 = _mm_mulhrs_epi16(_mm_sub_epi16(val_2, val_3), a1213);
                    __m128i r0_9 = _mm_add_epi16(val_1, t0_9);
                    __m128i r1_9 = _mm_add_epi16(val_3, t1_9);

                    // pack 16-bit data to 8-bit
                    __m128i q0_9 = _mm_packus_epi16(r0_9, r1_9);
                    // gather data from the same lines together
                    __m128i res10 = _mm_shuffle_epi8(q0_9, horizontal_shuf_mask);

                    val_0 = _mm_unpacklo_epi8(_mm_insert_epi64(val_0, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * mapsx[x + 13] + 1)]), 0), zero);
                    val_1 = _mm_unpacklo_epi8(_mm_insert_epi64(val_1, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * (mapsx[x + 13] + 1) + 1)]), 0), zero);

                    val_2 = _mm_insert_epi64(val_2, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * mapsx[x + 14])]), 0);
                    val_3 = _mm_insert_epi64(val_3, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * (mapsx[x + 14] + 1))]), 0);

                    val_2 = _mm_unpacklo_epi8(val_2, zero);
                    val_3 = _mm_unpacklo_epi8(val_3, zero);

                    __m128i t0_10 = _mm_mulhrs_epi16(_mm_sub_epi16(val_0, val_1), a1313);
                    __m128i t1_10 = _mm_mulhrs_epi16(_mm_sub_epi16(val_2, val_3), a1414);
                    __m128i r0_10 = _mm_add_epi16(val_1, t0_10);
                    __m128i r1_10 = _mm_add_epi16(val_3, t1_10);

                    __m128i q0_10 = _mm_packus_epi16(r0_10, r1_10);
                    __m128i res11 = _mm_shuffle_epi8(q0_10, horizontal_shuf_mask);

                    __m128i pix27 = _mm_lddqu_si128(reinterpret_cast<const __m128i*>(&tmp[4 * (chanNum * (mapsx[x + 15] - 1) + 2)]));
                    pix27 = _mm_insert_epi32(pix27, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 14] + 2)]), 0);

                    __m128i pix28 = _mm_lddqu_si128(reinterpret_cast<const __m128i*>(&tmp[4 * (chanNum * mapsx[x + 15] + 2)]));
                    pix28 = _mm_insert_epi32(pix28, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 14] + 1) + 2)]), 0);

                    val_0 = _mm_unpacklo_epi8(pix27, zero);
                    val_1 = _mm_unpacklo_epi8(pix28, zero);

                    val_2 = _mm_unpackhi_epi8(pix27, zero);
                    val_3 = _mm_unpackhi_epi8(pix28, zero);

                    // the main calculations
                    __m128i t0_11 = _mm_mulhrs_epi16(_mm_sub_epi16(val_0, val_1), a1415);
                    __m128i t1_11 = _mm_mulhrs_epi16(_mm_sub_epi16(val_2, val_3), a1515);
                    __m128i r0_11 = _mm_add_epi16(val_1, t0_11);
                    __m128i r1_11 = _mm_add_epi16(val_3, t1_11);

                    // pack 16-bit data to 8-bit
                    __m128i q0_11 = _mm_packus_epi16(r0_11, r1_11);
                    __m128i res12 = _mm_shuffle_epi8(q0_11, horizontal_shuf_mask);

                    __m128i bl7 = _mm_blend_epi16(res7, _mm_slli_si128(res8, 4), 0xCC /*0b11001100*/);
                    __m128i bl8 = _mm_blend_epi16(_mm_srli_si128(res7, 4), res8, 0xCC /*0b11001100*/);

                    __m128i bl9 = _mm_blend_epi16(res9, _mm_slli_si128(res10, 4), 0xCC /*0b11001100*/);
                    __m128i bl10 = _mm_blend_epi16(_mm_srli_si128(res9, 4), res10, 0xCC /*0b11001100*/);

                    __m128i bl11 = _mm_blend_epi16(res11, _mm_slli_si128(res12, 4), 0xCC /*0b11001100*/);
                    __m128i bl12 = _mm_blend_epi16(_mm_srli_si128(res11, 4), res12, 0xCC /*0b11001100*/);

                    __m128i bl57 = _mm_blend_epi16(bl5, _mm_slli_si128(bl7, 8), 0xF0 /*0b11110000*/);
                    __m128i bl75 = _mm_blend_epi16(_mm_srli_si128(bl5, 8), bl7, 0xF0 /*0b11110000*/);

                    __m128i bl68 = _mm_blend_epi16(bl6, _mm_slli_si128(bl8, 8), 0xF0 /*0b11110000*/);
                    __m128i bl86 = _mm_blend_epi16(_mm_srli_si128(bl6, 8), bl8, 0xF0 /*0b11110000*/);

                    __m128i bl911 = _mm_blend_epi16(bl9, _mm_slli_si128(bl11, 8), 0xF0 /*0b11110000*/);
                    __m128i bl119 = _mm_blend_epi16(_mm_srli_si128(bl9, 8), bl11, 0xF0 /*0b11110000*/);

                    __m128i bl1012 = _mm_blend_epi16(bl10, _mm_slli_si128(bl12, 8), 0xF0 /*0b11110000*/);
                    __m128i bl1210 = _mm_blend_epi16(_mm_srli_si128(bl10, 8), bl12, 0xF0 /*0b11110000*/);

                    _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[0][3 * x]), bl13);
                    _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[1][3 * x]), bl24);
                    _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[2][3 * x]), bl31);
                    _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[3][3 * x]), bl42);
                    _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[0][3 * x + 16]), bl57);
                    _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[1][3 * x + 16]), bl68);
                    _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[2][3 * x + 16]), bl75);
                    _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[3][3 * x + 16]), bl86);
                    _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[0][3 * x + 32]), bl911);
                    _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[1][3 * x + 32]), bl1012);
                    _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[2][3 * x + 32]), bl119);
                    _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[3][3 * x + 32]), bl1210);
                }

                // Tail: rewind to the last full vector; the final iteration
                // overlaps already computed pixels to cover outSz.width % nlanes.
                if (x < outSz.width) {
                    x = outSz.width - nlanes;
                    continue;
                }
                break;
            }
        }
        else
        {   // any lpi
            for (int l = 0; l < lpi; ++l) {
                short beta0 = beta[l];
                const uchar* s0 = src0[l];
                const uchar* s1 = src1[l];

                // vertical pass
                resize_vertical_anyLPI(s0, s1, tmp, inLength, beta0);

                // horizontal pass
                resize_horizontal_anyLPI(dst[l], tmp, mapsx, alpha, outSz.width);
            }
        }
    } else if (!xRatioEq) {
        GAPI_DbgAssert(yRatioEq);

        for (int l = 0; l < lpi; ++l) {
            const uchar* src = src0[l];

            // horizontal pass
            resize_horizontal_anyLPI(dst[l], src, mapsx, alpha, outSz.width);
        }
    } else if (!yRatioEq) {
        GAPI_DbgAssert(xRatioEq);
        int inLength = inSz.width * chanNum;  // inSz.width == outSz.width here

        for (int l = 0; l < lpi; ++l) {
            short beta0 = beta[l];
            const uchar* s0 = src0[l];
            const uchar* s1 = src1[l];

            // vertical pass
            resize_vertical_anyLPI(s0, s1, dst[l], inLength, beta0);
        }
    } else {
        GAPI_DbgAssert(xRatioEq && yRatioEq);
        int length = inSz.width * chanNum;

        for (int l = 0; l < lpi; ++l) {
            memcpy(dst[l], src0[l], length);
        }
    }
}
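
// For orientation, a hedged sketch of how a caller might prepare the Q0.15
// coefficient arrays this kernel consumes.  The helper name and the exact
// rounding/border policy are assumptions for illustration only; the real
// setup lives elsewhere in the fluid backend.  "clone" simply repeats each
// alpha[x] four times, matching the kernel's 64-bit reads from &clone[4 * x].
static void build_horizontal_coefs(int inW, int outW,
                                   std::vector<short>& mapsx,
                                   std::vector<short>& alpha,
                                   std::vector<short>& clone)
{
    mapsx.resize(outW);
    alpha.resize(outW);
    clone.resize(4 * outW);
    const float ratio = static_cast<float>(inW) / static_cast<float>(outW);
    for (int x = 0; x < outW; ++x)
    {
        float fx = (x + 0.5f) * ratio - 0.5f;                // center-aligned mapping
        if (fx < 0.f)                 fx = 0.f;              // clamp so sx and sx+1
        if (fx > float(inW - 2))      fx = float(inW - 2);   // stay inside the row
        const int   sx = static_cast<int>(fx);               // floor (fx >= 0 here)
        const float a  = 1.f - (fx - sx);                    // weight of the LEFT pixel
        int ia = static_cast<int>(a * (1 << 15) + 0.5f);
        if (ia > 32767) ia = 32767;                          // keep within Q0.15 short
        mapsx[x] = static_cast<short>(sx);
        alpha[x] = static_cast<short>(ia);
        for (int k = 0; k < 4; ++k)
            clone[4 * x + k] = alpha[x];
    }
}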
}  // namespace sse42
}  // namespace fluid
}  // namespace gapi
}  // namespace cv

#endif // !defined(GAPI_STANDALONE)