Merge pull request #17764 from alalek:issue_17762

5 years ago · 09f24a851e
parent bf8136eaa6 81e027eef7
commit 09f24a851e
4 changed files with 624 additions and 107 deletions
--- a/modules/dnn/perf/perf_layer.cpp
+++ b/modules/dnn/perf/perf_layer.cpp
@ -0,0 +1,95 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 #include "perf_precomp.hpp"
 #include <opencv2/dnn/shape_utils.hpp>
 namespace opencv_test {
 // Perf fixture for the DNN "Slice" layer, parameterized by a (backend, target) pair.
 struct Layer_Slice : public TestBaseWithParam<tuple<Backend, Target> >
 {
    // Builds a one-layer "Slice" network over a DIMS-dimensional CV_32FC1 blob,
    // performs one checked warm-up forward pass, then calls forward() inside TEST_CYCLE().
    //   inputShape - DIMS extents of the input blob
    //   begin, end - DIMS per-axis bounds passed as the layer's "begin" / "end" parameters
    template<int DIMS>
    void test_slice(const int* inputShape, const int* begin, const int* end)
    {
        const int backendId = get<0>(GetParam());
        const int targetId = get<1>(GetParam());

        // Fill the blob with a ramp that wraps every 4096 elements.
        Mat input(DIMS, inputShape, CV_32FC1, Scalar::all(0));
        float* const values = input.ptr<float>();
        const int valueCount = (int)input.total();
        for (int k = 0; k < valueCount; ++k)
            values[k] = (float)(k & 4095);

        // The same region as cv::Range objects; referenced only by the disabled debug output below.
        std::vector<Range> range;
        range.reserve(DIMS);
        for (int axis = 0; axis < DIMS; ++axis)
            range.push_back(Range(begin[axis], end[axis]));

        LayerParams sliceParams;
        sliceParams.name = "testLayer";
        sliceParams.type = "Slice";
        sliceParams.set("begin", DictValue::arrayInt<int*>(const_cast<int*>(begin), DIMS));
        sliceParams.set("end", DictValue::arrayInt<int*>(const_cast<int*>(end), DIMS));

        Net net;
        net.addLayerToPrev(sliceParams.name, sliceParams.type, sliceParams);

        // warmup
        net.setInput(input);
        net.setPreferableBackend(backendId);
        net.setPreferableTarget(targetId);
        {
            Mat out = net.forward();
            // Guards against an all-zero output.
            EXPECT_GT(cv::norm(out, NORM_INF), 0);
 #if 0
            //normAssert(out, input(range));
            cout << input(range).clone().reshape(1, 1) << endl;
            cout << out.reshape(1, 1) << endl;
 #endif
        }

        TEST_CYCLE()
        {
            Mat res = net.forward();
        }

        SANITY_CHECK_NOTHING();
    }
 };
 // Perf case "YOLOv4_tiny_1": 1x64x104x104 input, axis 1 sliced with begin 32 / end 64.
 PERF_TEST_P_(Layer_Slice, YOLOv4_tiny_1)
 {
    const int blobShape[4] = {1, 64, 104, 104};
    const int sliceFrom[4] = {0, 32, 0, 0};
    const int sliceTo[4] = {1, 64, 104, 104};
    test_slice<4>(blobShape, sliceFrom, sliceTo);
 }
 // Perf case "YOLOv4_tiny_2": 1x128x52x52 input, axis 1 sliced with begin 64 / end 128.
 PERF_TEST_P_(Layer_Slice, YOLOv4_tiny_2)
 {
    const int blobShape[4] = {1, 128, 52, 52};
    const int sliceFrom[4] = {0, 64, 0, 0};
    const int sliceTo[4] = {1, 128, 52, 52};
    test_slice<4>(blobShape, sliceFrom, sliceTo);
 }
 // Perf case "YOLOv4_tiny_3": 1x256x26x26 input, axis 1 sliced with begin 128 / end 256.
 PERF_TEST_P_(Layer_Slice, YOLOv4_tiny_3)
 {
    const int blobShape[4] = {1, 256, 26, 26};
    const int sliceFrom[4] = {0, 128, 0, 0};
    const int sliceTo[4] = {1, 256, 26, 26};
    test_slice<4>(blobShape, sliceFrom, sliceTo);
 }
 // Perf case "FastNeuralStyle_eccv16": 1x128x80x100 input, axes 2 and 3 sliced
 // with begin 2 on both and end 76 / 96 respectively.
 PERF_TEST_P_(Layer_Slice, FastNeuralStyle_eccv16)
 {
    const int blobShape[4] = {1, 128, 80, 100};
    const int sliceFrom[4] = {0, 0, 2, 2};
    const int sliceTo[4] = {1, 128, 76, 96};
    test_slice<4>(blobShape, sliceFrom, sliceTo);
 }
 // Instantiates every Layer_Slice case over the (backend, target) pairs
 // produced by dnnBackendsAndTargets(false, false).
 INSTANTIATE_TEST_CASE_P(/**/, Layer_Slice, dnnBackendsAndTargets(false, false));
 } // namespace
--- a/modules/dnn/src/layers/slice_layer.cpp
+++ b/modules/dnn/src/layers/slice_layer.cpp
@ -47,6 +47,8 @@
 #include "layers_common.hpp"
 #include <opencv2/dnn/shape_utils.hpp>
 #include <opencv2/core/utils/logger.hpp>
 #ifdef HAVE_OPENCL
 #include "opencl_kernels_dnn.hpp"
 #endif
@ -197,58 +199,168 @@ public:
                finalSliceRanges[i][j] = clamp(finalSliceRanges[i][j], inpShape[j]);
            }
        }
 #if 0
        std::cout << "DEBUG: DNN/Slice: " << outputs.size() << " inpShape=" << inpShape << std::endl;
        for (int i = 0; i < outputs.size(); ++i)
        {
            for (int j = 0; j < finalSliceRanges[i].size(); ++j)
            {
                std::cout << finalSliceRanges[i][j];
            }
            std::cout << std::endl;
        }
 #endif
    }
 #ifdef HAVE_OPENCL
    // OpenCL implementation of the Slice layer forward pass.
    // Returns false when this path declines to handle the call (the log message
    // below describes that as a fallback to CPU) or when a kernel cannot be
    // created/run; returns true once a kernel has been run for every output.
    // NOTE: the body is currently disabled by the `#if 1` below, so the function
    // always returns false.
    bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
    {
 #if 1
        // TODO fix that (breaks YOLOv4-tiny)
        return false;
 #else
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;
        // NOTE(review): use_half is referenced only by a '-' (removed) line below.
        bool use_half = (inputs_.depth() == CV_16S);
        inputs_.getUMatVector(inputs);
        outputs_.getUMatVector(outputs);
-        if (inputs[0].dims < 4 || (total(shape(outputs[0]), 0, 2) % 4 != 0) ||
+        CV_Assert(outputs.size() == finalSliceRanges.size());
-            (total(shape(outputs[0]), 2) % 4 != 0))
+
        const UMat& input = inputs[0];
        // The kernel source rejects DIMS > 5, so larger inputs are not handled here.
        if (input.dims > 5)
        {
            CV_LOG_INFO(NULL, "DNN/OpenCL/Slice: implementation doesn't support dims=" << input.dims << ". Fallback to CPU");
            return false;
        }
-        String opts;
+        size_t WSZ = 128;
-        if (use_half)
+
-            opts = "-DDtype=half -DDtype4=half4 -DDtype8=half8";
+        const int dims = input.dims;
-        else
+        const int elemSize = (int)input.elemSize();
-            opts = "-DDtype=float -DDtype4=float4 -DDtype8=float8";
+        String opts0 = cv::format(
-        const UMat& inpMat = inputs[0];
+                "-DDIMS=%d -DELEMSIZE=%d",
                dims, elemSize
            );
        // Source steps are emitted innermost dimension first:
        // option index d maps to Mat dimension (dims - 1 - d).
        for (int d = 0; d < dims; d++)
        {
            opts0 += cv::format(" -DSRC_STEP_%d=%d", d, (int)input.step[dims - 1 - d]);
        }
        String kname = cv::format("slice_%d", dims);
        // One kernel is built and launched per output, with that output's
        // geometry baked into the build options.
        for (size_t i = 0; i < outputs.size(); i++)
        {
-            int groups = outputs[i].size[0];
+            UMat& output = outputs[i];
-            int channels = outputs[i].size[1];
+            const std::vector<Range>& range = finalSliceRanges[i];
-            int rows = outputs[i].size[2];
+
-            int cols = outputs[i].size[3];
+            String opts = opts0;
-
+
-            ocl::Kernel kernel("slice", ocl::dnn::slice_oclsrc, opts);
+            CV_CheckEQ(output.dims, dims, "");
-            size_t local[] = { 128 };
+            for (int d = 0; d < dims; d++)
-            size_t global[] = { (size_t)groups * channels / 4 * local[0] };
+            {
-            int idx = 0;
+                opts += cv::format(" -DDST_STEP_%d=%d -DDST_SZ_%d=%d -DSRC_START_%d=%d",
-            kernel.set(idx++, ocl::KernelArg::PtrReadOnly(inpMat));
+                        d, (int)output.step[dims - 1 - d],
-            kernel.set(idx++, (int)(inpMat.size[2] * inpMat.size[3]));
+                        d, (int)output.size[dims - 1 - d],
-            kernel.set(idx++, (int)(rows * cols));
+                        d, (int)range[dims - 1 - d].start
-            kernel.set(idx++, (int)inpMat.size[3]);
+                    );
-            kernel.set(idx++, (int)cols);
+                CV_CheckEQ(range[d].size(), (int)output.size[d], "");
-            kernel.set(idx++, (int)finalSliceRanges[i][2].start);
+            }
-            kernel.set(idx++, (int)finalSliceRanges[i][3].start);
+
-            kernel.set(idx++, ocl::KernelArg::PtrWriteOnly(outputs[i]));
+            int block_dims = 0;
-            bool ret = kernel.run(1, global, local, false);
+            size_t block_size = elemSize;
            // Count trailing dimensions whose byte steps are equal in input and
            // output; block_size accumulates elemSize times their output sizes.
            // NOTE(review): this loop's `i` shadows the output index `i` of the enclosing loop.
            for (int i = dims - 1; i >= 0; --i)
            {
                if (input.step[i] != output.step[i])
                    break;
                block_size *= output.size[i];
                block_dims++;
            }
            const size_t total = output.total() * elemSize;
            size_t num_blocks = total / block_size;
            // A few large blocks, or very large blocks: copy each block as one flat run.
            if ((num_blocks <= 8 && block_size >= WSZ * 4) || (block_size >= WSZ * 64))
            {
                // use 1D copy mode
                opts += cv::format(" -DUSE_COPY_1D=1");
                opts += cv::format(" -DBLOCK_DIMS=%d", block_dims);
                opts += cv::format(" -DBLOCK_DIMS_CONTIGUOUS=%d", block_dims);
                opts += cv::format(" -DBLOCK_SIZE=%d", (int)block_size);
                opts += cv::format(" -DBLOCK_COLS=%d", (int)block_size);
            }
            else
            {
                // use 2D copy mode
                int block_cols = block_size;
                int block_dims_contiguous = block_dims;
                size_t input_base_step = input.step[dims - 1 - block_dims_contiguous];
                size_t output_base_step = output.step[dims - 1 - block_dims_contiguous];
                size_t block_rows = 1;
                // Extend the block over further dimensions while the input/output
                // step ratio stays equal to the ratio of the base steps.
                for (int i = dims - 1 - block_dims_contiguous; i >= 0; --i)
                {
                    if (input.step[i] * output_base_step != output.step[i] * input_base_step)
                        break;
                    block_rows *= output.size[i];
                    block_dims++;
                }
                block_size *= block_rows;
                num_blocks = total / block_size;
                if (block_rows > 1)
                {
                    opts += cv::format(" -DBLOCK_DIMS=%d", block_dims);
                    opts += cv::format(" -DBLOCK_DIMS_CONTIGUOUS=%d", block_dims_contiguous);
                    opts += cv::format(" -DBLOCK_SIZE=%d", (int)block_size);
                    opts += cv::format(" -DBLOCK_COLS=%d", (int)block_cols);
                    opts += cv::format(" -DBLOCK_ROWS=%d", (int)block_rows);
                    opts += cv::format(" -DBLOCK_SRC_STRIDE=%d", (int)input_base_step);
                }
                else
                {
                    // use 1D copy mode
                    opts += cv::format(" -DUSE_COPY_1D=1");
                    opts += cv::format(" -DBLOCK_DIMS=%d", block_dims_contiguous);
                    opts += cv::format(" -DBLOCK_DIMS_CONTIGUOUS=%d", block_dims_contiguous);
                    opts += cv::format(" -DBLOCK_SIZE=%d", (int)block_size);
                    opts += cv::format(" -DBLOCK_COLS=%d", (int)block_size);
                }
            }
            // Use a smaller work-group for small blocks.
            // NOTE(review): WSZ is declared outside the per-output loop, so a value
            // lowered here persists for later outputs, including the WSZ-based
            // copy-mode check above.
            const size_t MIN_WORK_ITEMS = 16;
            if (block_size <= 4 * MIN_WORK_ITEMS)
                WSZ = 4;
            else if (block_size <= 8 * MIN_WORK_ITEMS)
                WSZ = 8;
            else if (block_size <= 16 * MIN_WORK_ITEMS)
                WSZ = 16;
            else if (block_size <= 32 * MIN_WORK_ITEMS)
                WSZ = 32;
            else if (block_size <= 64 * MIN_WORK_ITEMS)
                WSZ = 64;
            opts += cv::format(" -DWSZ=%d", (int)WSZ);
            // Global size is WSZ x num_blocks with a WSZ x 1 local size.
            size_t local[] = { WSZ, 1 };
            size_t global[] = { WSZ, num_blocks };
            ocl::Kernel kernel(kname.c_str(), ocl::dnn::slice_oclsrc, opts);
            if (kernel.empty())
                return false;
            bool ret = kernel.args(
                    ocl::KernelArg::PtrReadOnly(input),
                    ocl::KernelArg::PtrWriteOnly(output)
                )
                .run(2, global, local, false);
            if (!ret)
                return false;
-        }
+        }  // for outputs.size()
        return true;
 #endif
        }
 #endif
--- a/modules/dnn/src/opencl/slice.cl
+++ b/modules/dnn/src/opencl/slice.cl
@ -1,81 +1,283 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
+// This file is part of OpenCV project.
-//
+// It is subject to the license terms in the LICENSE file found in the top-level directory
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+// of this distribution and at http://opencv.org/license.html.
-//
+
-//  By downloading, copying, installing or using the software you agree to this license.
+// Copyright (C) 2020, Intel Corporation, all rights reserved.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2017, Intel Corporation, all rights reserved.
 // Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
 // Third party copyrights are property of their respective owners.
-//
+
-// Redistribution and use in source and binary forms, with or without modification,
+/*
-// are permitted provided that the following conditions are met:
+Specialization constants:
-//
+- WSZ: size of OpenCL local group
-//   * Redistribution's of source code must retain the above copyright notice,
+- DIMS: number of working dimensions
-//     this list of conditions and the following disclaimer.
+- ELEMSIZE: element size in bytes
-//
+- DST_SZ_<i>: dst sizes
-//   * Redistribution's in binary form must reproduce the above copyright notice,
+- SRC_START_<i>: src index shift (slice .start value)
-//     this list of conditions and the following disclaimer in the documentation
+- SRC_STEP_<i>: src steps (bytes)
-//     and/or other materials provided with the distribution.
+- DST_STEP_<i>: dst steps (bytes), derived from DST_SZ_<i> and ELEMSIZE
-//
+- BLOCK_DIMS: number of dims for copy block (argmax(count(SRC_STEP_<i> != DST_STEP_<i>) <= 1))
-//   * The name of the copyright holders may not be used to endorse or promote products
+- BLOCK_DIMS_CONTIGUOUS (<= BLOCK_DIMS): SRC_STEP_<i> == DST_STEP_<i> for i in [0, BLOCK_DIMS_CONTIGUOUS)
-//     derived from this software without specific prior written permission.
+
-//
+derived specialization constants:
-// This software is provided by the copyright holders and contributors "as is" and
+- BLOCK_SIZE: ELEMSIZE * mul(DST_SZ_<i>) for i in [0, BLOCK_DIMS)
-// any express or implied warranties, including, but not limited to, the implied
+
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
+- USE_COPY_1D iff BLOCK_DIMS == BLOCK_DIMS_CONTIGUOUS
-// In no event shall the Intel Corporation or contributors be liable for any direct,
+- BLOCK_COLS:
-// indirect, incidental, special, exemplary, or consequential damages
+  * with USE_COPY_1D: BLOCK_SIZE
-// (including, but not limited to, procurement of substitute goods or services;
+  * w/o USE_COPY_1D: ELEMSIZE * mul(DST_SZ_<i>) for i in [0, BLOCK_DIMS_CONTIGUOUS)
-// loss of use, data, or profits; or business interruption) however caused
+- BLOCK_ROWS:
-// and on any theory of liability, whether in contract, strict liability,
+  * with USE_COPY_1D: N/A
-// or tort (including negligence or otherwise) arising in any way out of
+  * w/o USE_COPY_1D: mul(DST_SZ_<i>) for i in [BLOCK_DIMS_CONTIGUOUS, BLOCK_DIMS) (a row count, not a byte size)
-// the use of this software, even if advised of the possibility of such damage.
+- BLOCK_SRC_STRIDE:
-//
+  * with USE_COPY_1D: N/A
-//M*/
+  * w/o USE_COPY_1D: SRC_STEP_<BLOCK_DIMS_CONTIGUOUS> (src byte stride between consecutive block rows)
-
+
-#if defined(cl_khr_fp16)
+Note: SZ, STEP values are in reverse order relative to OpenCV Mat:
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+- NCHW SZ: [cols, rows, channels, batch]
-#endif
+- NCHW STEP: [elemsize, cols * elemsize, rows * cols * elemsize, ...] (DIMS+1 value)
-
+
-__kernel void slice(__global const Dtype* src,
+*/
-                    const int src_plane_size,
+
-                    const int dst_plane_size,
+/*
-                    const int src_cols,
+local: <WSZ, 1, 1>
-                    const int dst_cols,
+global: <WSZ, number_of_copy_blocks, 1>
-                    const int row_offset,
+*/
-                    const int col_offset,
+
-                    __global Dtype* dst)
+#define CONCAT_(A, B) A##B
 #define CONCAT(A, B) CONCAT_(A, B)
 #define BLOCK_COLS_X4 (BLOCK_COLS / 4)
 #define BLOCK_COLS_X16 (BLOCK_COLS / 16)
 #ifdef USE_COPY_1D
 // Copies BLOCK_COLS bytes from src0 + src_offset to dst0 + dst_offset.
 // The work is split across work-items by get_local_id(0), in up to three
 // compile-time-selected stages: 16-byte vectors (four per iteration),
 // 4-byte vectors (four per iteration), then single bytes for what remains.
 // `processed` carries the number of bytes already covered into the next stage.
 // NOTE(review): lines prefixed with '-' reference names (lid, dst_plane_size,
 // Dtype, ...) that belong to the removed kernel; the two unmarked lines
 // `int i = lid;` / `while( i < dst_plane_size / 4)` look like leftovers of it too.
 static inline
 __attribute__((always_inline))
 void copy_block_1d(
    __global const uchar* src0,
    const uint src_offset,
    __global uchar* dst0,
    const uint dst_offset
 )
 {
-    unsigned int row_gid = get_group_id(0);
+    __global const uchar* src = src0 + src_offset;
-    unsigned int lid = get_local_id(0);
+    __global uchar* dst = dst0 + dst_offset;
-    const __global Dtype *src_read = src + row_gid * 4 * src_plane_size;
+
-    __global Dtype *dst_read = dst + row_gid * 4 * dst_plane_size;
+    uint processed = 0;
-    Dtype4 a0, a1, a2, a3;
+
-
+#if BLOCK_COLS_X16 >= 4
    int i = lid;
    while( i < dst_plane_size / 4)
    {
-        int row = (4 * i) / dst_cols + row_offset;
+        // uchar16 x 4rows per iteration
-        int col = (4 * i) % dst_cols + col_offset;
+        uint i = get_local_id(0) * 16;  // uchar16
-        int src_index = row * src_cols + col;
+        while (i < BLOCK_COLS_X16 * 16)
        {
            // Four chunk offsets spaced WSZ chunks apart; any offset past the
            // end of the 16-byte-aligned region is replaced by i, so that lane
            // repeats the first chunk's copy instead of going out of range.
            uint4 idx = (uint4)(i, i + 16 * WSZ, i + 32 * WSZ, i + 48 * WSZ);
            idx = select((uint4)i, idx, idx < (BLOCK_COLS_X16 * 16));
-        a0 = vload4(0, src_read + src_index);
+            uchar16 a0 = vload16(0, src + idx.s0);
-        a1 = vload4(0, src_read + src_index + src_plane_size);
+            uchar16 a1 = vload16(0, src + idx.s1);
-        a2 = vload4(0, src_read + src_index + 2 * src_plane_size);
+            uchar16 a2 = vload16(0, src + idx.s2);
-        a3 = vload4(0, src_read + src_index + 3 * src_plane_size);
+            uchar16 a3 = vload16(0, src + idx.s3);
-        vstore4(a0, i, dst_read);
+            vstore16(a0, 0, dst + idx.s0);
-        vstore4(a1, i, dst_read + dst_plane_size);
+            vstore16(a1, 0, dst + idx.s1);
-        vstore4(a2, i, dst_read + 2 * dst_plane_size);
+            vstore16(a2, 0, dst + idx.s2);
-        vstore4(a3, i, dst_read + 3 * dst_plane_size);
+            vstore16(a3, 0, dst + idx.s3);
-        i += get_local_size(0);
+            i += WSZ * 16 * 4;
        }
        processed = BLOCK_COLS_X16 * 16;
    }
 #else
 #define SKIP_1D_BLOCK_COLS_X16 1
 #endif
 // The 4-byte stage is compiled in when the 16-byte stage was skipped or left
 // a remainder of at least one whole 4-byte chunk.
 #if BLOCK_COLS_X4 > 0 && (defined(SKIP_1D_BLOCK_COLS_X16) || (BLOCK_COLS_X16 * 16 != BLOCK_COLS_X4 * 4))
    {
        // uchar4 x 4rows per iteration
        uint i = get_local_id(0) * 4 + processed;  // uchar4
        while (i < BLOCK_COLS_X4 * 4)
        {
            // Same out-of-range handling as the 16-byte stage.
            uint4 idx = (uint4)(i, i + 4 * WSZ, i + 8 * WSZ, i + 12 * WSZ);
            idx = select((uint4)i, idx, idx < (BLOCK_COLS_X4 * 4));
            uchar4 a0 = vload4(0, src + idx.s0);
            uchar4 a1 = vload4(0, src + idx.s1);
            uchar4 a2 = vload4(0, src + idx.s2);
            uchar4 a3 = vload4(0, src + idx.s3);
            vstore4(a0, 0, dst + idx.s0);
            vstore4(a1, 0, dst + idx.s1);
            vstore4(a2, 0, dst + idx.s2);
            vstore4(a3, 0, dst + idx.s3);
            i += WSZ * 4 * 4;
        }
        processed = BLOCK_COLS_X4 * 4;
    }
 #else
 #define SKIP_1D_BLOCK_COLS_X4 1
 #endif  // BLOCK_COLS_X4 > 0
 // Byte-wise stage: compiled in when both vector stages were skipped or when
 // BLOCK_COLS is not a multiple of 4.
 #if (defined(SKIP_1D_BLOCK_COLS_X16) && defined(SKIP_1D_BLOCK_COLS_X4)) || BLOCK_COLS_X4 * 4 != BLOCK_COLS
    {
        uint i = get_local_id(0) + processed;
        while (i < BLOCK_COLS)
        {
            uchar a0 = src[i];
            dst[i] = a0;
            i += WSZ;
        }
    }
 #endif
 }
 #else  // USE_COPY_1D
 // Copies a block of BLOCK_ROWS rows by BLOCK_COLS bytes from
 // src0 + src_offset0 to dst0 + dst_offset0. Source rows are
 // BLOCK_SRC_STRIDE bytes apart; destination rows are BLOCK_COLS bytes apart.
 // Each work-item starts at 4 * get_local_id(0) and advances by 4 * WSZ over
 // an index space in which every row is padded up to a multiple of 4 columns.
 static inline
 __attribute__((always_inline))
 void copy_block_2d(
    __global const uchar* src0,
    const uint src_offset0,
    __global uchar* dst0,
    const uint dst_offset0
 )
 {
    __global const uchar* src = src0 + src_offset0;
    __global uchar* dst = dst0 + dst_offset0;
    uint i = get_local_id(0) * 4;
 // Row width rounded up to a multiple of 4, and the resulting padded block size.
 #define BLOCK_COLS_FILL_X4 (((BLOCK_COLS + 3) / 4) * 4)
 #define BLOCK_SIZE_FILL_X4 (BLOCK_COLS_FILL_X4 * BLOCK_ROWS)
    while (i < BLOCK_SIZE_FILL_X4)
    {
        int row = i / BLOCK_COLS_FILL_X4;
        int col = i % BLOCK_COLS_FILL_X4;
        uint src_offset = row * BLOCK_SRC_STRIDE + col;
 #if BLOCK_COLS_FILL_X4 == BLOCK_COLS
        // No padding: the padded index is already the destination byte offset.
        uint dst_offset = i;
 #else
        uint dst_offset = row * BLOCK_COLS + col;
 #endif
 #if BLOCK_COLS_FILL_X4 != BLOCK_COLS
        // Full 4-byte chunk that fits inside the real row width.
        if (col <= BLOCK_COLS - 4)
 #endif
        {
            uchar4 a = vload4(0, src + src_offset);
            vstore4(a, 0, dst + dst_offset);
        }
 #if BLOCK_COLS_FILL_X4 != BLOCK_COLS
        else
        {
            /* non-optimized reference code
            while (col < BLOCK_COLS)
            {
                uchar a = src[src_offset];
                dst[dst_offset] = a;
                col++;
                src_offset++;
                dst_offset++;
            }
            */
            // Partial last chunk of a row: byte offsets that would fall at or
            // past BLOCK_COLS are replaced by 0, so those stores repeat the
            // first byte of the chunk instead of writing beyond the row.
            uint4 shift = (uint4)(0, 1, 2, 3);
            shift = select((uint4)0, shift, col + shift < BLOCK_COLS);
            dst[dst_offset + shift.s0] = src[src_offset + shift.s0];
            // The second and third byte copies are only compiled in when the
            // row remainder can be that long.
 #if BLOCK_COLS_FILL_X4 - BLOCK_COLS <= 2
            dst[dst_offset + shift.s1] = src[src_offset + shift.s1];
 #endif
 #if BLOCK_COLS_FILL_X4 - BLOCK_COLS <= 1
            dst[dst_offset + shift.s2] = src[src_offset + shift.s2];
 #endif
        }
 #endif  // BLOCK_COLS_FILL_X4 != BLOCK_COLS
        i += WSZ * 4;
    }
 }
 #endif  // USE_COPY_1D
 // Kernel entry point, named slice_<DIMS>.
 // Each value of get_global_id(1) selects one copy block of BLOCK_SIZE bytes.
 // Destination blocks are laid out back to back (dst_offset = block_id * BLOCK_SIZE);
 // the source byte offset is rebuilt from the block index using the
 // DST_STEP_<i>, SRC_START_<i> and SRC_STEP_<i> build options.
 //   src - source buffer, addressed in bytes
 //   dst - destination buffer, addressed in bytes
 __kernel void
 CONCAT(slice_, DIMS)(
    __global const uchar* src,
    __global uchar* dst
 )
 {
    uint block_id = get_global_id(1);
    uint dst_offset = block_id * BLOCK_SIZE;
    uint src_offset = 0;
 // Splits the index along `dim` off block_id and leaves the remainder in block_id.
 #define CALC_SRC_INDEX(dim) \
    { \
    uint plane_sz = CONCAT(DST_STEP_, dim) / BLOCK_SIZE; \
    CONCAT(idx_, dim) = block_id / plane_sz; \
    block_id = block_id - CONCAT(idx_, dim) * plane_sz; \
    }
 // Adds (index + slice start) * source step for `dim` to src_offset.
 // A plain 32-bit multiply-add is used instead of mad24(): mad24() is only
 // well-defined for unsigned operands below 2^24, and SRC_STEP_<dim> is a byte
 // step that can exceed that range for large blobs.
 #define UPDATE_SRC_OFFSET(dim) \
    src_offset = (uint)(CONCAT(idx_, dim) + CONCAT(SRC_START_, dim)) * (uint)CONCAT(SRC_STEP_, dim) + (uint)src_offset;
 /*
    if (get_global_id(0) == 0 && get_global_id(1) == 0) \
        printf("(%d, %d): @%d src_offset=%d   idx_dim=%d   block_id=%d\n", \
            get_global_id(0), get_global_id(1), \
            dim, src_offset, CONCAT(idx_, dim), block_id \
        );
 */
 #if DIMS > 5
 #error "invalid configuration"
 #endif
 // Dimensions are processed outermost first. A dimension that lies inside the
 // copy block (dim < BLOCK_DIMS) keeps index 0 and only contributes its slice start.
 #if DIMS > 4
    uint idx_4 = 0;
 #if BLOCK_DIMS <= 4
    CALC_SRC_INDEX(4)
 #endif
    UPDATE_SRC_OFFSET(4)
 #endif
 #if DIMS > 3
    uint idx_3 = 0;
 #if BLOCK_DIMS <= 3
    CALC_SRC_INDEX(3)
 #endif
    UPDATE_SRC_OFFSET(3)
 #endif
 #if DIMS > 2
    uint idx_2 = 0;
 #if BLOCK_DIMS <= 2
    CALC_SRC_INDEX(2)
 #endif
    UPDATE_SRC_OFFSET(2)
 #endif
 #if DIMS > 1
    uint idx_1 = 0;
 #if BLOCK_DIMS <= 1
    CALC_SRC_INDEX(1)
 #endif
    UPDATE_SRC_OFFSET(1)
 #endif
 #if DIMS > 0
    uint idx_0 = 0;
    UPDATE_SRC_OFFSET(0)
 #endif
 /*
    if (get_global_id(0) == 0)
        printf("(%d, %d): src_offset=%d dst_offset=%d\n",
            get_global_id(0), get_global_id(1),
            src_offset, dst_offset
        );
 */
 #ifdef USE_COPY_1D
    copy_block_1d(src, src_offset, dst, dst_offset);
 #else
    copy_block_2d(src, src_offset, dst, dst_offset);
 #endif
 }
--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@ -1837,7 +1837,115 @@ TEST_P(Layer_Test_Resize, change_input)
 INSTANTIATE_TEST_CASE_P(/**/, Layer_Test_Resize, dnnBackendsAndTargets());
-typedef testing::TestWithParam<tuple<Backend, Target> > Layer_Test_Slice;
+struct Layer_Test_Slice : public testing::TestWithParam<tuple<Backend, Target> >
 {
    // Accuracy helper for the DNN "Slice" layer.
    // Builds a one-layer network over a DIMS-dimensional CV_32FC1 blob filled
    // with 0, 1, 2, ..., runs one forward pass on the parameterized
    // backend/target and compares the output against input(range).
    //   inputShape - DIMS extents of the input blob
    //   begin, end - DIMS per-axis bounds, used both as the layer's
    //                "begin" / "end" parameters and to build the reference ranges
    template<int DIMS>
    void test_slice(const int* inputShape, const int* begin, const int* end)
    {
        int backendId = get<0>(GetParam());
        int targetId = get<1>(GetParam());
        Mat input(DIMS, inputShape, CV_32FC1, Scalar::all(0));
        // Each element holds its own linear index.
        for (int i = 0; i < (int)input.total(); ++i)
            input.ptr<float>()[i] = (float)i;
        // Reference region used for the comparison below.
        std::vector<Range> range(DIMS);
        for (int i = 0; i < DIMS; ++i)
            range[i] = Range(begin[i], end[i]);
        Net net;
        LayerParams lp;
        lp.type = "Slice";
        lp.name = "testLayer";
        lp.set("begin", DictValue::arrayInt<int*>((int*)&begin[0], DIMS));
        lp.set("end", DictValue::arrayInt<int*>((int*)&end[0], DIMS));
        net.addLayerToPrev(lp.name, lp.type, lp);
        {
            net.setInput(input);
            net.setPreferableBackend(backendId);
            net.setPreferableTarget(targetId);
            Mat out = net.forward();
            // Guards against an all-zero output before the element-wise comparison.
            EXPECT_GT(cv::norm(out, NORM_INF), 0);
            normAssert(out, input(range));
 #if 0
            cout << input(range).clone().reshape(1, 1) << endl;
            cout << out.reshape(1, 1) << endl;
 #endif
        }
    }
 };
 // 1x16x6x8 input, sliced along axis 1 only (begin 4, end 8).
 TEST_P(Layer_Test_Slice, slice_channels_17762)
 {
    const int blobShape[4] = {1, 16, 6, 8};
    const int sliceFrom[4] = {0, 4, 0, 0};
    const int sliceTo[4] = {1, 8, 6, 8};
    test_slice<4>(blobShape, sliceFrom, sliceTo);
 }
 // 4x4x3x4 input, sliced along axis 1 only (begin 1, end 3); axis 0 is kept whole.
 TEST_P(Layer_Test_Slice, slice_channels_with_batch_17762)
 {
    const int blobShape[4] = {4, 4, 3, 4};
    const int sliceFrom[4] = {0, 1, 0, 0};
    const int sliceTo[4] = {4, 3, 3, 4};
    test_slice<4>(blobShape, sliceFrom, sliceTo);
 }
 // 4x4x3x4 input, sliced along axis 0 (begin 2, end 4) and axis 1 (begin 1, end 3).
 // The skip tags are applied when running on the IE NN Builder 2019 backend.
 TEST_P(Layer_Test_Slice, slice_channels_and_batch_17762)
 {
    const int backendId = get<0>(GetParam());
    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
    {
        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
    }

    const int blobShape[4] = {4, 4, 3, 4};
    const int sliceFrom[4] = {2, 1, 0, 0};
    const int sliceTo[4] = {4, 3, 3, 4};
    test_slice<4>(blobShape, sliceFrom, sliceTo);
 }
 // 1x2x6x4 input, sliced along axis 2 only (begin 4, end 6).
 TEST_P(Layer_Test_Slice, slice_rows)
 {
    const int blobShape[4] = {1, 2, 6, 4};
    const int sliceFrom[4] = {0, 0, 4, 0};
    const int sliceTo[4] = {1, 2, 6, 4};
    test_slice<4>(blobShape, sliceFrom, sliceTo);
 }
 // 1x2x3x8 input, sliced along axis 3 only (begin 4, end 8).
 TEST_P(Layer_Test_Slice, slice_cols)
 {
    const int blobShape[4] = {1, 2, 3, 8};
    const int sliceFrom[4] = {0, 0, 0, 4};
    const int sliceTo[4] = {1, 2, 3, 8};
    test_slice<4>(blobShape, sliceFrom, sliceTo);
 }
 // 1x4x2x3 input, sliced along axes 1, 2 and 3 at once
 // (begin 2/1/0, end 3/2/2); the innermost extent is 3 elements.
 TEST_P(Layer_Test_Slice, slice_complex_1_unaligned)
 {
    const int blobShape[4] = {1, 4, 2, 3};
    const int sliceFrom[4] = {0, 2, 1, 0};
    const int sliceTo[4] = {1, 3, 2, 2};
    test_slice<4>(blobShape, sliceFrom, sliceTo);
 }
 // 1x3x2x4 input, sliced along axes 1, 2 and 3 at once
 // (begin 2/1/0, end 3/2/2); the innermost extent is 4 elements.
 TEST_P(Layer_Test_Slice, slice_complex_2_x4)
 {
    const int blobShape[4] = {1, 3, 2, 4};
    const int sliceFrom[4] = {0, 2, 1, 0};
    const int sliceTo[4] = {1, 3, 2, 2};
    test_slice<4>(blobShape, sliceFrom, sliceTo);
 }
 // 1x6x4x8 input, sliced along axes 1, 2 and 3 at once (begin 2/1/4, end 4/3/8).
 TEST_P(Layer_Test_Slice, slice_complex_3)
 {
    const int blobShape[4] = {1, 6, 4, 8};
    const int sliceFrom[4] = {0, 2, 1, 4};
    const int sliceTo[4] = {1, 4, 3, 8};
    test_slice<4>(blobShape, sliceFrom, sliceTo);
 }
 TEST_P(Layer_Test_Slice, variable_input_shape)
 {
    int backendId = get<0>(GetParam());