From cc584760d3b631fb1c8a6a3d4240fc0d9f59a856 Mon Sep 17 00:00:00 2001
From: Dmitry Kurtaev <dmitry.kurtaev+github@gmail.com>
Date: Wed, 15 Jul 2020 14:36:13 +0300
Subject: [PATCH 1/6] Fix TensorFlow->ONNX imports

---
 modules/dnn/src/graph_simplifier.cpp               | 3 ---
 modules/dnn/src/tensorflow/tf_graph_simplifier.cpp | 5 ++++-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/modules/dnn/src/graph_simplifier.cpp b/modules/dnn/src/graph_simplifier.cpp
index 166564c215..a23fce30f5 100644
--- a/modules/dnn/src/graph_simplifier.cpp
+++ b/modules/dnn/src/graph_simplifier.cpp
@@ -63,9 +63,6 @@ int Subgraph::getInputNodeId(const Ptr<ImportGraphWrapper>& net,
 {
     CV_Assert(inpId < node->getNumInputs());
     std::string name = node->getInputName(inpId);
-    // If operation produces several tensors, they are specified by index
-    // after ':' character. In example, "input:0".
-    name = name.substr(0, name.rfind(':'));
     const int numNodes = net->getNumNodes();
     for (int i = 0; i < numNodes; ++i)
     {
diff --git a/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp b/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp
index ef9e68a873..354fef0297 100644
--- a/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp
+++ b/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp
@@ -31,7 +31,10 @@ public:
 
     virtual std::string getInputName(int idx) const CV_OVERRIDE
     {
-        return node->input(idx);
+        // If operation produces several tensors, they are specified by index
+        // after ':' character. In example, "input:0".
+        std::string name = node->input(idx);
+        return name.substr(0, name.rfind(':'));
     }
 
     virtual std::string getType() const CV_OVERRIDE

From 435b6df9897461cd8ec4a9f913fdb910c1e5a42d Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Tue, 19 May 2020 11:48:05 +0000
Subject: [PATCH 2/6] dnn: use OpenVINO 2020.4 defines

original commit: 2813aa7eb9c81e7bc8c765d10e508ad134f6b376
---
 cmake/OpenCVDetectInferenceEngine.cmake | 4 ++--
 modules/dnn/src/op_inf_engine.hpp       | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cmake/OpenCVDetectInferenceEngine.cmake b/cmake/OpenCVDetectInferenceEngine.cmake
index c0379c32b3..3eaf890f32 100644
--- a/cmake/OpenCVDetectInferenceEngine.cmake
+++ b/cmake/OpenCVDetectInferenceEngine.cmake
@@ -135,9 +135,9 @@ endif()
 
 if(INF_ENGINE_TARGET)
   if(NOT INF_ENGINE_RELEASE)
-    message(WARNING "InferenceEngine version has not been set, 2020.3 will be used by default. Set INF_ENGINE_RELEASE variable if you experience build errors.")
+    message(WARNING "InferenceEngine version has not been set, 2020.4 will be used by default. Set INF_ENGINE_RELEASE variable if you experience build errors.")
   endif()
-  set(INF_ENGINE_RELEASE "2020030000" CACHE STRING "Force IE version, should be in form YYYYAABBCC (e.g. 2020.1.0.2 -> 2020010002)")
+  set(INF_ENGINE_RELEASE "2020040000" CACHE STRING "Force IE version, should be in form YYYYAABBCC (e.g. 2020.1.0.2 -> 2020010002)")
   set_target_properties(${INF_ENGINE_TARGET} PROPERTIES
     INTERFACE_COMPILE_DEFINITIONS "HAVE_INF_ENGINE=1;INF_ENGINE_RELEASE=${INF_ENGINE_RELEASE}"
   )
diff --git a/modules/dnn/src/op_inf_engine.hpp b/modules/dnn/src/op_inf_engine.hpp
index 351840f3f2..8c494ac918 100644
--- a/modules/dnn/src/op_inf_engine.hpp
+++ b/modules/dnn/src/op_inf_engine.hpp
@@ -29,8 +29,8 @@
 #define INF_ENGINE_RELEASE_2020_4 2020040000
 
 #ifndef INF_ENGINE_RELEASE
-#warning("IE version have not been provided via command-line. Using 2020.3 by default")
-#define INF_ENGINE_RELEASE INF_ENGINE_RELEASE_2020_3
+#warning("IE version have not been provided via command-line. Using 2020.4 by default")
+#define INF_ENGINE_RELEASE INF_ENGINE_RELEASE_2020_4
 #endif
 
 #define INF_ENGINE_VER_MAJOR_GT(ver) (((INF_ENGINE_RELEASE) / 10000) > ((ver) / 10000))

From f8d6c5b330e7bdb5a74e3f7e634c74a2c400ec1a Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Wed, 15 Jul 2020 20:44:53 +0000
Subject: [PATCH 3/6] winpack_dldt: switch defaults to OpenVINO 2020.4

---
 platforms/winpack_dldt/build_package.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/platforms/winpack_dldt/build_package.py b/platforms/winpack_dldt/build_package.py
index 8f3724135d..c33e07026b 100644
--- a/platforms/winpack_dldt/build_package.py
+++ b/platforms/winpack_dldt/build_package.py
@@ -150,7 +150,7 @@ def git_apply_patch(src_dir, patch_file):
     patch_file = str(patch_file)  # Python 3.5 may not handle Path
     assert os.path.exists(patch_file), patch_file
     execute(cmd=['git', 'apply', '--3way', '-v', '--ignore-space-change', str(patch_file)], cwd=src_dir)
-    execute(cmd=['git', 'diff', 'HEAD'], cwd=src_dir)
+    execute(cmd=['git', '--no-pager', 'diff', 'HEAD'], cwd=src_dir)
 
 
 #===================================================================================================
@@ -443,8 +443,8 @@ class Builder:
 def main():
 
     dldt_src_url = 'https://github.com/openvinotoolkit/openvino'
-    dldt_src_commit = '2020.3.0'
-    dldt_release = '2020030000'
+    dldt_src_commit = '2020.4'
+    dldt_release = '2020040000'
 
     build_cache_dir_default = os.environ.get('BUILD_CACHE_DIR', '.build_cache')
     build_subst_drive = os.environ.get('BUILD_SUBST_DRIVE', None)

From 55e85498394b990ed224b109c7b2b88bf7c2553d Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Wed, 15 Jul 2020 22:10:45 +0000
Subject: [PATCH 4/6] dnn: eliminate IE deprecation warning

---
 modules/dnn/src/op_inf_engine.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/dnn/src/op_inf_engine.hpp b/modules/dnn/src/op_inf_engine.hpp
index 8c494ac918..e8fdada99a 100644
--- a/modules/dnn/src/op_inf_engine.hpp
+++ b/modules/dnn/src/op_inf_engine.hpp
@@ -44,7 +44,7 @@
 #pragma GCC diagnostic ignored "-Wsuggest-override"
 #endif
 
-#ifdef HAVE_DNN_IE_NN_BUILDER_2019
+#if defined(HAVE_DNN_IE_NN_BUILDER_2019) || INF_ENGINE_VER_MAJOR_EQ(INF_ENGINE_RELEASE_2020_4)
 //#define INFERENCE_ENGINE_DEPRECATED  // turn off deprecation warnings from IE
 //there is no way to suppress warnings from IE only at this moment, so we are forced to suppress warnings globally
 #if defined(__GNUC__)
@@ -53,7 +53,7 @@
 #ifdef _MSC_VER
 #pragma warning(disable: 4996)  // was declared deprecated
 #endif
-#endif  // HAVE_DNN_IE_NN_BUILDER_2019
+#endif
 
 #if defined(__GNUC__) && INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2020_1)
 #pragma GCC visibility push(default)

From 1c371d07b571beb3ba69daf9b7ee7f289f2e0169 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Wed, 15 Jul 2020 22:52:08 +0000
Subject: [PATCH 5/6] dnn(test): adjust tests for OpenVINO 2020.4

---
 modules/dnn/perf/perf_net.cpp              | 15 +++++++++--
 modules/dnn/test/test_backends.cpp         |  2 ++
 modules/dnn/test/test_caffe_importer.cpp   |  9 +++++--
 modules/dnn/test/test_darknet_importer.cpp | 29 ++++++++++++++++++++--
 modules/dnn/test/test_ie_models.cpp        |  8 +++++-
 modules/dnn/test/test_layers.cpp           | 10 ++++++++
 modules/dnn/test/test_onnx_importer.cpp    | 14 +++++++++++
 7 files changed, 80 insertions(+), 7 deletions(-)

diff --git a/modules/dnn/perf/perf_net.cpp b/modules/dnn/perf/perf_net.cpp
index 3bee2313c0..23ece025e7 100644
--- a/modules/dnn/perf/perf_net.cpp
+++ b/modules/dnn/perf/perf_net.cpp
@@ -196,6 +196,13 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv3)
 {
     if (backend == DNN_BACKEND_HALIDE)
         throw SkipTestException("");
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000)  // nGraph compilation failure
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)
+        throw SkipTestException("Test is disabled in OpenVINO 2020.4");
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16)
+        throw SkipTestException("Test is disabled in OpenVINO 2020.4");
+#endif
+
     Mat sample = imread(findDataFile("dnn/dog416.png"));
     cvtColor(sample, sample, COLOR_BGR2RGB);
     Mat inp;
@@ -209,6 +216,12 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv4)
         throw SkipTestException("");
     if (target == DNN_TARGET_MYRIAD)
         throw SkipTestException("");
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000)  // nGraph compilation failure
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)
+        throw SkipTestException("Test is disabled in OpenVINO 2020.4");
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16)
+        throw SkipTestException("Test is disabled in OpenVINO 2020.4");
+#endif
     Mat sample = imread(findDataFile("dnn/dog416.png"));
     cvtColor(sample, sample, COLOR_BGR2RGB);
     Mat inp;
@@ -220,8 +233,6 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv4_tiny)
 {
     if (backend == DNN_BACKEND_HALIDE)
         throw SkipTestException("");
-    if (target == DNN_TARGET_MYRIAD)
-        throw SkipTestException("");
     Mat sample = imread(findDataFile("dnn/dog416.png"));
     cvtColor(sample, sample, COLOR_BGR2RGB);
     Mat inp;
diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp
index f1cb2663eb..c88f48754c 100644
--- a/modules/dnn/test/test_backends.cpp
+++ b/modules/dnn/test/test_backends.cpp
@@ -390,6 +390,8 @@ TEST_P(DNNTestNetwork, DenseNet_121)
     if (target == DNN_TARGET_OPENCL_FP16)
     {
         l1 = 2e-2; lInf = 9e-2;
+        if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+            lInf = 0.1f;
     }
     else if (target == DNN_TARGET_MYRIAD)
     {
diff --git a/modules/dnn/test/test_caffe_importer.cpp b/modules/dnn/test/test_caffe_importer.cpp
index 4d4f2d0d10..91a68099ce 100644
--- a/modules/dnn/test/test_caffe_importer.cpp
+++ b/modules/dnn/test/test_caffe_importer.cpp
@@ -489,6 +489,11 @@ TEST_P(Test_Caffe_nets, Colorization)
     {
         l1 = 0.5; lInf = 11;
     }
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16)
+    {
+        l1 = 0.26; lInf = 6.5;
+    }
+
     normAssert(out, ref, "", l1, lInf);
     expectNoFallbacksFromIE(net);
 }
@@ -515,8 +520,8 @@ TEST_P(Test_Caffe_nets, DenseNet_121)
     float l1 = default_l1, lInf = default_lInf;
     if (target == DNN_TARGET_OPENCL_FP16)
     {
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2019020000)
-        l1 = 0.04; lInf = 0.21;
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019020000)
+        l1 = 0.045; lInf = 0.21;
 #else
         l1 = 0.017; lInf = 0.0795;
 #endif
diff --git a/modules/dnn/test/test_darknet_importer.cpp b/modules/dnn/test/test_darknet_importer.cpp
index f328b29b20..552c1fa111 100644
--- a/modules/dnn/test/test_darknet_importer.cpp
+++ b/modules/dnn/test/test_darknet_importer.cpp
@@ -323,6 +323,12 @@ TEST_P(Test_Darknet_nets, YoloVoc)
         CV_TEST_TAG_LONG
     );
 
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000)  // nGraph compilation failure
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019010000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16);
@@ -363,6 +369,12 @@ TEST_P(Test_Darknet_nets, TinyYoloVoc)
 {
     applyTestTag(CV_TEST_TAG_MEMORY_512MB);
 
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000)  // nGraph compilation failure
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif
 #if defined(INF_ENGINE_RELEASE)
     if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) &&
         target == DNN_TARGET_MYRIAD && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
@@ -466,6 +478,13 @@ TEST_P(Test_Darknet_nets, YOLOv3)
 {
     applyTestTag(CV_TEST_TAG_LONG, (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB));
 
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000)  // nGraph compilation failure
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif
+
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_MYRIAD)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
 
@@ -530,6 +549,12 @@ TEST_P(Test_Darknet_nets, YOLOv4)
 {
     applyTestTag(CV_TEST_TAG_LONG, (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB));
 
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000)  // nGraph compilation failure
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif
 #if defined(INF_ENGINE_RELEASE)
     if (target == DNN_TARGET_MYRIAD)  // NC_OUT_OF_MEMORY
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
@@ -626,7 +651,7 @@ TEST_P(Test_Darknet_nets, YOLOv4_tiny)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_OPENCL)
         iouDiff = std::numeric_limits<double>::quiet_NaN();
     if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
-         backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && DNN_TARGET_OPENCL_FP16)
+         backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && target == DNN_TARGET_OPENCL_FP16)
         iouDiff = std::numeric_limits<double>::quiet_NaN();
 #endif
 
@@ -646,7 +671,7 @@ TEST_P(Test_Darknet_nets, YOLOv4_tiny)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_OPENCL)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
     if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
-         backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && DNN_TARGET_OPENCL_FP16)
+         backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && target == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
 #endif
 }
diff --git a/modules/dnn/test/test_ie_models.cpp b/modules/dnn/test/test_ie_models.cpp
index f6685676f9..2ba7d80f58 100644
--- a/modules/dnn/test/test_ie_models.cpp
+++ b/modules/dnn/test/test_ie_models.cpp
@@ -73,7 +73,8 @@ struct OpenVINOModelTestCaseInfo
 static const std::map<std::string, OpenVINOModelTestCaseInfo>& getOpenVINOTestModels()
 {
     static std::map<std::string, OpenVINOModelTestCaseInfo> g_models {
-#if INF_ENGINE_RELEASE >= 2018050000
+#if INF_ENGINE_RELEASE >= 2018050000 && \
+    INF_ENGINE_RELEASE <= 2020999999  // don't use IRv5 models with 2020.1+
         // layout is defined by open_model_zoo/model_downloader
         // Downloaded using these parameters for Open Model Zoo downloader (2019R1):
         // ./downloader.py -o ${OPENCV_DNN_TEST_DATA_PATH}/omz_intel_models --cache_dir ${OPENCV_DNN_TEST_DATA_PATH}/.omz_cache/ \
@@ -295,6 +296,11 @@ TEST_P(DNNTestOpenVINO, models)
     }
 #endif
 
+#if INF_ENGINE_VER_MAJOR_EQ(2020040000)
+    if (targetId == DNN_TARGET_MYRIAD && modelName == "person-detection-retail-0002")  // IRv5, OpenVINO 2020.4 regression
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif
+
     if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
         setInferenceEngineBackendType(CV_DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_API);
     else if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp
index 88f44d3ba7..41a587e45f 100644
--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@@ -364,6 +364,16 @@ TEST_P(Test_Caffe_layers, layer_prelu_fc)
     // Reference output values are in range [-0.0001, 10.3906]
     double l1 = (target == DNN_TARGET_MYRIAD) ? 0.005 : 0.0;
     double lInf = (target == DNN_TARGET_MYRIAD) ? 0.021 : 0.0;
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)
+    {
+        l1 = 0.006f; lInf = 0.05f;
+    }
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16)
+    {
+        l1 = 0.01f; lInf = 0.05f;
+    }
+#endif
     testLayerUsingCaffeModels("layer_prelu_fc", true, false, l1, lInf);
 }
 
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index 181d32d861..4c8e66aae1 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -704,6 +704,13 @@ TEST_P(Test_ONNX_nets, TinyYolov2)
     // output range: [-11; 8]
     double l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.017 : default_l1;
     double lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.14 : default_lInf;
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16)
+    {
+        l1 = 0.018f; lInf = 0.16f;
+    }
+#endif
+
     testONNXModels("tiny_yolo2", pb, l1, lInf);
 }
 
@@ -781,6 +788,13 @@ TEST_P(Test_ONNX_nets, Emotion_ferplus)
         l1 = 2.4e-4;
         lInf = 6e-4;
     }
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16)
+    {
+        l1 = 0.012f; lInf = 0.035f;
+    }
+#endif
+
     testONNXModels("emotion_ferplus", pb, l1, lInf);
 }
 

From 81e027eef714cce869ac03cda884a895503bb417 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Mon, 6 Jul 2020 14:32:42 +0000
Subject: [PATCH 6/6] dnn: fix OpenCL implementation of Slice layer

---
 modules/dnn/perf/perf_layer.cpp        |  95 +++++++
 modules/dnn/src/layers/slice_layer.cpp | 178 ++++++++++---
 modules/dnn/src/opencl/slice.cl        | 348 +++++++++++++++++++------
 modules/dnn/test/test_layers.cpp       | 110 +++++++-
 4 files changed, 624 insertions(+), 107 deletions(-)
 create mode 100644 modules/dnn/perf/perf_layer.cpp

diff --git a/modules/dnn/perf/perf_layer.cpp b/modules/dnn/perf/perf_layer.cpp
new file mode 100644
index 0000000000..06fa57f319
--- /dev/null
+++ b/modules/dnn/perf/perf_layer.cpp
@@ -0,0 +1,95 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "perf_precomp.hpp"
+#include <opencv2/dnn/shape_utils.hpp>
+
+namespace opencv_test {
+
+struct Layer_Slice : public TestBaseWithParam<tuple<Backend, Target> >
+{
+    template<int DIMS>
+    void test_slice(const int* inputShape, const int* begin, const int* end)
+    {
+        int backendId = get<0>(GetParam());
+        int targetId = get<1>(GetParam());
+
+        Mat input(DIMS, inputShape, CV_32FC1, Scalar::all(0));
+        for (int i = 0; i < (int)input.total(); ++i)
+            input.ptr<float>()[i] = (float)(i & 4095);
+
+        std::vector<Range> range(DIMS);
+        for (int i = 0; i < DIMS; ++i)
+            range[i] = Range(begin[i], end[i]);
+
+        Net net;
+        LayerParams lp;
+        lp.type = "Slice";
+        lp.name = "testLayer";
+        lp.set("begin", DictValue::arrayInt<int*>((int*)&begin[0], DIMS));
+        lp.set("end", DictValue::arrayInt<int*>((int*)&end[0], DIMS));
+        net.addLayerToPrev(lp.name, lp.type, lp);
+
+        // warmup
+        {
+            net.setInput(input);
+            net.setPreferableBackend(backendId);
+            net.setPreferableTarget(targetId);
+            Mat out = net.forward();
+
+            EXPECT_GT(cv::norm(out, NORM_INF), 0);
+#if 0
+            //normAssert(out, input(range));
+            cout << input(range).clone().reshape(1, 1) << endl;
+            cout << out.reshape(1, 1) << endl;
+#endif
+        }
+
+        TEST_CYCLE()
+        {
+            Mat res = net.forward();
+        }
+
+        SANITY_CHECK_NOTHING();
+    }
+};
+
+
+
+PERF_TEST_P_(Layer_Slice, YOLOv4_tiny_1)
+{
+    const int inputShape[4] = {1, 64, 104, 104};
+    const int begin[] = {0, 32, 0, 0};
+    const int end[] = {1, 64, 104, 104};
+    test_slice<4>(inputShape, begin, end);
+}
+
+PERF_TEST_P_(Layer_Slice, YOLOv4_tiny_2)
+{
+    const int inputShape[4] = {1, 128, 52, 52};
+    const int begin[] = {0, 64, 0, 0};
+    const int end[] = {1, 128, 52, 52};
+    test_slice<4>(inputShape, begin, end);
+}
+
+PERF_TEST_P_(Layer_Slice, YOLOv4_tiny_3)
+{
+    const int inputShape[4] = {1, 256, 26, 26};
+    const int begin[] = {0, 128, 0, 0};
+    const int end[] = {1, 256, 26, 26};
+    test_slice<4>(inputShape, begin, end);
+}
+
+
+PERF_TEST_P_(Layer_Slice, FastNeuralStyle_eccv16)
+{
+    const int inputShape[4] = {1, 128, 80, 100};
+    const int begin[] = {0, 0, 2, 2};
+    const int end[] = {1, 128, 76, 96};
+    test_slice<4>(inputShape, begin, end);
+}
+
+INSTANTIATE_TEST_CASE_P(/**/, Layer_Slice, dnnBackendsAndTargets(false, false));
+
+} // namespace
diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp
index a16384cbd4..d7d541474e 100644
--- a/modules/dnn/src/layers/slice_layer.cpp
+++ b/modules/dnn/src/layers/slice_layer.cpp
@@ -47,6 +47,8 @@
 #include "layers_common.hpp"
 #include <opencv2/dnn/shape_utils.hpp>
 
+#include <opencv2/core/utils/logger.hpp>
+
 #ifdef HAVE_OPENCL
 #include "opencl_kernels_dnn.hpp"
 #endif
@@ -197,58 +199,168 @@ public:
                 finalSliceRanges[i][j] = clamp(finalSliceRanges[i][j], inpShape[j]);
             }
         }
+
+#if 0
+        std::cout << "DEBUG: DNN/Slice: " << outputs.size() << " inpShape=" << inpShape << std::endl;
+        for (int i = 0; i < outputs.size(); ++i)
+        {
+            for (int j = 0; j < finalSliceRanges[i].size(); ++j)
+            {
+                std::cout << finalSliceRanges[i][j];
+            }
+            std::cout << std::endl;
+        }
+#endif
     }
 
 #ifdef HAVE_OPENCL
     bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
     {
-#if 1
-        // TODO fix that (brokes YOLOv4-tiny)
-        return false;
-#else
         std::vector<UMat> inputs;
         std::vector<UMat> outputs;
 
-        bool use_half = (inputs_.depth() == CV_16S);
         inputs_.getUMatVector(inputs);
         outputs_.getUMatVector(outputs);
 
-        if (inputs[0].dims < 4 || (total(shape(outputs[0]), 0, 2) % 4 != 0) ||
-            (total(shape(outputs[0]), 2) % 4 != 0))
+        CV_Assert(outputs.size() == finalSliceRanges.size());
+
+        const UMat& input = inputs[0];
+        if (input.dims > 5)
+        {
+            CV_LOG_INFO(NULL, "DNN/OpenCL/Slice: implementation doesn't support dims=" << input.dims << ". Fallback to CPU");
             return false;
+        }
 
-        String opts;
-        if (use_half)
-            opts = "-DDtype=half -DDtype4=half4 -DDtype8=half8";
-        else
-            opts = "-DDtype=float -DDtype4=float4 -DDtype8=float8";
-        const UMat& inpMat = inputs[0];
+        size_t WSZ = 128;
+
+        const int dims = input.dims;
+        const int elemSize = (int)input.elemSize();
+        String opts0 = cv::format(
+                "-DDIMS=%d -DELEMSIZE=%d",
+                dims, elemSize
+            );
+        for (int d = 0; d < dims; d++)
+        {
+            opts0 += cv::format(" -DSRC_STEP_%d=%d", d, (int)input.step[dims - 1 - d]);
+        }
+        String kname = cv::format("slice_%d", dims);
         for (size_t i = 0; i < outputs.size(); i++)
         {
-            int groups = outputs[i].size[0];
-            int channels = outputs[i].size[1];
-            int rows = outputs[i].size[2];
-            int cols = outputs[i].size[3];
-
-            ocl::Kernel kernel("slice", ocl::dnn::slice_oclsrc, opts);
-            size_t local[] = { 128 };
-            size_t global[] = { (size_t)groups * channels / 4 * local[0] };
-            int idx = 0;
-            kernel.set(idx++, ocl::KernelArg::PtrReadOnly(inpMat));
-            kernel.set(idx++, (int)(inpMat.size[2] * inpMat.size[3]));
-            kernel.set(idx++, (int)(rows * cols));
-            kernel.set(idx++, (int)inpMat.size[3]);
-            kernel.set(idx++, (int)cols);
-            kernel.set(idx++, (int)finalSliceRanges[i][2].start);
-            kernel.set(idx++, (int)finalSliceRanges[i][3].start);
-            kernel.set(idx++, ocl::KernelArg::PtrWriteOnly(outputs[i]));
-            bool ret = kernel.run(1, global, local, false);
+            UMat& output = outputs[i];
+            const std::vector<Range>& range = finalSliceRanges[i];
+
+            String opts = opts0;
+
+            CV_CheckEQ(output.dims, dims, "");
+            for (int d = 0; d < dims; d++)
+            {
+                opts += cv::format(" -DDST_STEP_%d=%d -DDST_SZ_%d=%d -DSRC_START_%d=%d",
+                        d, (int)output.step[dims - 1 - d],
+                        d, (int)output.size[dims - 1 - d],
+                        d, (int)range[dims - 1 - d].start
+                    );
+                CV_CheckEQ(range[d].size(), (int)output.size[d], "");
+            }
+
+            int block_dims = 0;
+            size_t block_size = elemSize;
+            for (int i = dims - 1; i >= 0; --i)
+            {
+                if (input.step[i] != output.step[i])
+                    break;
+                block_size *= output.size[i];
+                block_dims++;
+            }
+
+            const size_t total = output.total() * elemSize;
+            size_t num_blocks = total / block_size;
+
+            if ((num_blocks <= 8 && block_size >= WSZ * 4) || (block_size >= WSZ * 64))
+            {
+                // use 1D copy mode
+                opts += cv::format(" -DUSE_COPY_1D=1");
+
+                opts += cv::format(" -DBLOCK_DIMS=%d", block_dims);
+                opts += cv::format(" -DBLOCK_DIMS_CONTIGUOUS=%d", block_dims);
+                opts += cv::format(" -DBLOCK_SIZE=%d", (int)block_size);
+
+                opts += cv::format(" -DBLOCK_COLS=%d", (int)block_size);
+            }
+            else
+            {
+                // use 2D copy mode
+                int block_cols = block_size;
+                int block_dims_contiguous = block_dims;
+                size_t input_base_step = input.step[dims - 1 - block_dims_contiguous];
+                size_t output_base_step = output.step[dims - 1 - block_dims_contiguous];
+
+                size_t block_rows = 1;
+                for (int i = dims - 1 - block_dims_contiguous; i >= 0; --i)
+                {
+                    if (input.step[i] * output_base_step != output.step[i] * input_base_step)
+                        break;
+                    block_rows *= output.size[i];
+                    block_dims++;
+                }
+
+                block_size *= block_rows;
+
+                num_blocks = total / block_size;
+
+                if (block_rows > 1)
+                {
+                    opts += cv::format(" -DBLOCK_DIMS=%d", block_dims);
+                    opts += cv::format(" -DBLOCK_DIMS_CONTIGUOUS=%d", block_dims_contiguous);
+                    opts += cv::format(" -DBLOCK_SIZE=%d", (int)block_size);
+
+                    opts += cv::format(" -DBLOCK_COLS=%d", (int)block_cols);
+
+                    opts += cv::format(" -DBLOCK_ROWS=%d", (int)block_rows);
+                    opts += cv::format(" -DBLOCK_SRC_STRIDE=%d", (int)input_base_step);
+                }
+                else
+                {
+                    // use 1D copy mode
+                    opts += cv::format(" -DUSE_COPY_1D=1");
+
+                    opts += cv::format(" -DBLOCK_DIMS=%d", block_dims_contiguous);
+                    opts += cv::format(" -DBLOCK_DIMS_CONTIGUOUS=%d", block_dims_contiguous);
+                    opts += cv::format(" -DBLOCK_SIZE=%d", (int)block_size);
+
+                    opts += cv::format(" -DBLOCK_COLS=%d", (int)block_size);
+                }
+            }
+
+            const size_t MIN_WORK_ITEMS = 16;
+            if (block_size <= 4 * MIN_WORK_ITEMS)
+                WSZ = 4;
+            else if (block_size <= 8 * MIN_WORK_ITEMS)
+                WSZ = 8;
+            else if (block_size <= 16 * MIN_WORK_ITEMS)
+                WSZ = 16;
+            else if (block_size <= 32 * MIN_WORK_ITEMS)
+                WSZ = 32;
+            else if (block_size <= 64 * MIN_WORK_ITEMS)
+                WSZ = 64;
+
+            opts += cv::format(" -DWSZ=%d", (int)WSZ);
+
+            size_t local[] = { WSZ, 1 };
+            size_t global[] = { WSZ, num_blocks };
+
+            ocl::Kernel kernel(kname.c_str(), ocl::dnn::slice_oclsrc, opts);
+            if (kernel.empty())
+                return false;
+            bool ret = kernel.args(
+                    ocl::KernelArg::PtrReadOnly(input),
+                    ocl::KernelArg::PtrWriteOnly(output)
+                )
+                .run(2, global, local, false);
             if (!ret)
                 return false;
-        }
+        }  // for outputs.size()
 
         return true;
-#endif
         }
 #endif
 
diff --git a/modules/dnn/src/opencl/slice.cl b/modules/dnn/src/opencl/slice.cl
index 5f96a4e4c8..d468dbc16a 100644
--- a/modules/dnn/src/opencl/slice.cl
+++ b/modules/dnn/src/opencl/slice.cl
@@ -1,81 +1,283 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2017, Intel Corporation, all rights reserved.
-// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Copyright (C) 2020, Intel Corporation, all rights reserved.
 // Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if defined(cl_khr_fp16)
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#endif
-
-__kernel void slice(__global const Dtype* src,
-                    const int src_plane_size,
-                    const int dst_plane_size,
-                    const int src_cols,
-                    const int dst_cols,
-                    const int row_offset,
-                    const int col_offset,
-                    __global Dtype* dst)
+
+/*
+Specialization constants:
+- WSZ: size of OpenCL local group
+- DIMS: number of working dimensions
+- ELEMSIZE: element size in bytes
+- DST_SZ_<i>: dst sizes
+- SRC_START_<i>: src index shift (slice .start value)
+- SRC_STEP_<i>: src steps (bytes)
+- DST_STEP_<i>: dst steps (bytes), derived from DST_SZ_<i> and ELEMSIZE
+- BLOCK_DIMS: number of dims for copy block (argmax(count(SRC_STEP_<i> != DST_STEP_<i>) <= 1))
+- BLOCK_DIMS_CONTIGUOUS (<= BLOCK_DIMS): SRC_STEP_<i> == DST_STEP_<i> for i in [0, BLOCK_DIMS_CONTIGUOUS)
+
+derived specialization constants:
+- BLOCK_SIZE: ELEMSIZE * mul(DST_SZ_<i>) for i in [0, BLOCK_DIMS)
+
+- USE_COPY_1D iff BLOCK_DIMS == BLOCK_DIMS_CONTIGUOUS
+- BLOCK_COLS:
+  * with USE_COPY_1D: BLOCK_SIZE
+  * w/o USE_COPY_1D: ELEMSIZE * mul(DST_SZ_<i>) for i in [0, BLOCK_DIMS_CONTIGUOUS)
+- BLOCK_ROWS:
+  * with USE_COPY_1D: N/A
+  * w/o USE_COPY_1D: ELEMSIZE * mul(DST_SZ_<i>) for i in [BLOCK_DIMS_CONTIGUOUS, BLOCK_DIMS)
+- BLOCK_SRC_STRIDE:
+  * with USE_COPY_1D: N/A
+  * w/o USE_COPY_1D: ELEMSIZE * mul(SRC_STEP_<i>) for i in [0, BLOCK_DIMS_CONTIGUOUS)
+
+Note: SZ, STEP values are in reversed order than OpenCV Mat:
+- NCHW SZ: [cols, rows, channels, batch]
+- NCHW STEP: [elemsize, cols * elemsize, rows * cols * elemsize, ...] (DIMS+1 value)
+
+*/
+
+/*
+local: <WSZ, 1, 1>
+global: <WSZ, number_of_copy_blocks, 1>
+*/
+
+#define CONCAT_(A, B) A##B
+#define CONCAT(A, B) CONCAT_(A, B)
+
+#define BLOCK_COLS_X4 (BLOCK_COLS / 4)
+#define BLOCK_COLS_X16 (BLOCK_COLS / 16)
+
+#ifdef USE_COPY_1D
+
+static inline
+__attribute__((always_inline))
+void copy_block_1d(
+    __global const uchar* src0,
+    const uint src_offset,
+    __global uchar* dst0,
+    const uint dst_offset
+)
 {
-    unsigned int row_gid = get_group_id(0);
-    unsigned int lid = get_local_id(0);
-    const __global Dtype *src_read = src + row_gid * 4 * src_plane_size;
-    __global Dtype *dst_read = dst + row_gid * 4 * dst_plane_size;
-    Dtype4 a0, a1, a2, a3;
-
-    int i = lid;
-    while( i < dst_plane_size / 4)
+    __global const uchar* src = src0 + src_offset;
+    __global uchar* dst = dst0 + dst_offset;
+
+    uint processed = 0;
+
+#if BLOCK_COLS_X16 >= 4
     {
-        int row = (4 * i) / dst_cols + row_offset;
-        int col = (4 * i) % dst_cols + col_offset;
-        int src_index = row * src_cols + col;
+        // uchar16 x 4rows per iteration
+        uint i = get_local_id(0) * 16;  // uchar16
+        while (i < BLOCK_COLS_X16 * 16)
+        {
+            uint4 idx = (uint4)(i, i + 16 * WSZ, i + 32 * WSZ, i + 48 * WSZ);
+            idx = select((uint4)i, idx, idx < (BLOCK_COLS_X16 * 16));
 
-        a0 = vload4(0, src_read + src_index);
-        a1 = vload4(0, src_read + src_index + src_plane_size);
-        a2 = vload4(0, src_read + src_index + 2 * src_plane_size);
-        a3 = vload4(0, src_read + src_index + 3 * src_plane_size);
+            uchar16 a0 = vload16(0, src + idx.s0);
+            uchar16 a1 = vload16(0, src + idx.s1);
+            uchar16 a2 = vload16(0, src + idx.s2);
+            uchar16 a3 = vload16(0, src + idx.s3);
 
-        vstore4(a0, i, dst_read);
-        vstore4(a1, i, dst_read + dst_plane_size);
-        vstore4(a2, i, dst_read + 2 * dst_plane_size);
-        vstore4(a3, i, dst_read + 3 * dst_plane_size);
+            vstore16(a0, 0, dst + idx.s0);
+            vstore16(a1, 0, dst + idx.s1);
+            vstore16(a2, 0, dst + idx.s2);
+            vstore16(a3, 0, dst + idx.s3);
 
-        i += get_local_size(0);
+            i += WSZ * 16 * 4;
+        }
+        processed = BLOCK_COLS_X16 * 16;
     }
+#else
+#define SKIP_1D_BLOCK_COLS_X16 1
+#endif
+
+#if BLOCK_COLS_X4 > 0 && (defined(SKIP_1D_BLOCK_COLS_X16) || (BLOCK_COLS_X16 * 16 != BLOCK_COLS_X4 * 4))
+    {
+        // uchar4 x 4rows per iteration
+        uint i = get_local_id(0) * 4 + processed;  // uchar4
+        while (i < BLOCK_COLS_X4 * 4)
+        {
+            uint4 idx = (uint4)(i, i + 4 * WSZ, i + 8 * WSZ, i + 12 * WSZ);
+            idx = select((uint4)i, idx, idx < (BLOCK_COLS_X4 * 4));
+
+            uchar4 a0 = vload4(0, src + idx.s0);
+            uchar4 a1 = vload4(0, src + idx.s1);
+            uchar4 a2 = vload4(0, src + idx.s2);
+            uchar4 a3 = vload4(0, src + idx.s3);
+
+            vstore4(a0, 0, dst + idx.s0);
+            vstore4(a1, 0, dst + idx.s1);
+            vstore4(a2, 0, dst + idx.s2);
+            vstore4(a3, 0, dst + idx.s3);
+
+            i += WSZ * 4 * 4;
+        }
+        processed = BLOCK_COLS_X4 * 4;
+    }
+#else
+#define SKIP_1D_BLOCK_COLS_X4 1
+#endif  // BLOCK_COLS_X4 > 0
+
+#if (defined(SKIP_1D_BLOCK_COLS_X16) && defined(SKIP_1D_BLOCK_COLS_X4)) || BLOCK_COLS_X4 * 4 != BLOCK_COLS
+    {
+        uint i = get_local_id(0) + processed;
+        while (i < BLOCK_COLS)
+        {
+            uchar a0 = src[i];
+            dst[i] = a0;
+
+            i += WSZ;
+        }
+    }
+#endif
+}
+
+#else  // USE_COPY_1D
+
+static inline
+__attribute__((always_inline))
+void copy_block_2d(
+    __global const uchar* src0,
+    const uint src_offset0,
+    __global uchar* dst0,
+    const uint dst_offset0
+)
+{
+    __global const uchar* src = src0 + src_offset0;
+    __global uchar* dst = dst0 + dst_offset0;
+
+    uint i = get_local_id(0) * 4;
+
+#define BLOCK_COLS_FILL_X4 (((BLOCK_COLS + 3) / 4) * 4)
+#define BLOCK_SIZE_FILL_X4 (BLOCK_COLS_FILL_X4 * BLOCK_ROWS)
+
+    while (i < BLOCK_SIZE_FILL_X4)
+    {
+        int row = i / BLOCK_COLS_FILL_X4;
+        int col = i % BLOCK_COLS_FILL_X4;
+
+        uint src_offset = row * BLOCK_SRC_STRIDE + col;
+#if BLOCK_COLS_FILL_X4 == BLOCK_COLS
+        uint dst_offset = i;
+#else
+        uint dst_offset = row * BLOCK_COLS + col;
+#endif
+
+#if BLOCK_COLS_FILL_X4 != BLOCK_COLS
+        if (col <= BLOCK_COLS - 4)
+#endif
+        {
+            uchar4 a = vload4(0, src + src_offset);
+            vstore4(a, 0, dst + dst_offset);
+        }
+#if BLOCK_COLS_FILL_X4 != BLOCK_COLS
+        else
+        {
+            /* non-optimized reference code
+            while (col < BLOCK_COLS)
+            {
+                uchar a = src[src_offset];
+                dst[dst_offset] = a;
+                col++;
+                src_offset++;
+                dst_offset++;
+            }
+            */
+
+            uint4 shift = (uint4)(0, 1, 2, 3);
+            shift = select((uint4)0, shift, col + shift < BLOCK_COLS);
+
+            dst[dst_offset + shift.s0] = src[src_offset + shift.s0];
+
+#if BLOCK_COLS_FILL_X4 - BLOCK_COLS <= 2
+            dst[dst_offset + shift.s1] = src[src_offset + shift.s1];
+#endif
+#if BLOCK_COLS_FILL_X4 - BLOCK_COLS <= 1
+            dst[dst_offset + shift.s2] = src[src_offset + shift.s2];
+#endif
+        }
+#endif  // BLOCK_COLS_FILL_X4 != BLOCK_COLS
+        i += WSZ * 4;
+    }
+}
+
+#endif  // USE_COPY_1D
+
+__kernel void
+CONCAT(slice_, DIMS)(
+    __global const uchar* src,
+    __global uchar* dst
+)
+{
+    uint block_id = get_global_id(1);
+
+    uint dst_offset = block_id * BLOCK_SIZE;
+
+    uint src_offset = 0;
+
+#define CALC_SRC_INDEX(dim) \
+    { \
+    uint plane_sz = CONCAT(DST_STEP_, dim) / BLOCK_SIZE; \
+    CONCAT(idx_, dim) = block_id / plane_sz; \
+    block_id = block_id - CONCAT(idx_, dim) * plane_sz; \
+    }
+#define UPDATE_SRC_OFFSET(dim) \
+    src_offset = mad24((uint)(CONCAT(idx_, dim) + CONCAT(SRC_START_, dim)), (uint)CONCAT(SRC_STEP_, dim), (uint)src_offset);
+/*
+    if (get_global_id(0) == 0 && get_global_id(1) == 0) \
+        printf("(%d, %d): @%d src_offset=%d   idx_dim=%d   block_id=%d\n", \
+            get_global_id(0), get_global_id(1), \
+            dim, src_offset, CONCAT(idx_, dim), block_id \
+        );
+*/
+
+#if DIMS > 5
+#error "invalid configuration"
+#endif
+#if DIMS > 4
+    uint idx_4 = 0;
+#if BLOCK_DIMS <= 4
+    CALC_SRC_INDEX(4)
+#endif
+    UPDATE_SRC_OFFSET(4)
+#endif
+#if DIMS > 3
+    uint idx_3 = 0;
+#if BLOCK_DIMS <= 3
+    CALC_SRC_INDEX(3)
+#endif
+    UPDATE_SRC_OFFSET(3)
+#endif
+#if DIMS > 2
+    uint idx_2 = 0;
+#if BLOCK_DIMS <= 2
+    CALC_SRC_INDEX(2)
+#endif
+    UPDATE_SRC_OFFSET(2)
+#endif
+#if DIMS > 1
+    uint idx_1 = 0;
+#if BLOCK_DIMS <= 1
+    CALC_SRC_INDEX(1)
+#endif
+    UPDATE_SRC_OFFSET(1)
+#endif
+#if DIMS > 0
+    uint idx_0 = 0;
+    UPDATE_SRC_OFFSET(0)
+#endif
+
+/*
+    if (get_global_id(0) == 0)
+        printf("(%d, %d): src_offset=%d dst_offset=%d\n",
+            get_global_id(0), get_global_id(1),
+            src_offset, dst_offset
+        );
+*/
+
+#ifdef USE_COPY_1D
+    copy_block_1d(src, src_offset, dst, dst_offset);
+#else
+    copy_block_2d(src, src_offset, dst, dst_offset);
+#endif
 }
diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp
index 88f44d3ba7..fbf136ca66 100644
--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@@ -1837,7 +1837,115 @@ TEST_P(Layer_Test_Resize, change_input)
 
 INSTANTIATE_TEST_CASE_P(/**/, Layer_Test_Resize, dnnBackendsAndTargets());
 
-typedef testing::TestWithParam<tuple<Backend, Target> > Layer_Test_Slice;
+struct Layer_Test_Slice : public testing::TestWithParam<tuple<Backend, Target> >
+{
+    template<int DIMS>
+    void test_slice(const int* inputShape, const int* begin, const int* end)
+    {
+        int backendId = get<0>(GetParam());
+        int targetId = get<1>(GetParam());
+
+        Mat input(DIMS, inputShape, CV_32FC1, Scalar::all(0));
+        for (int i = 0; i < (int)input.total(); ++i)
+            input.ptr<float>()[i] = (float)i;
+
+        std::vector<Range> range(DIMS);
+        for (int i = 0; i < DIMS; ++i)
+            range[i] = Range(begin[i], end[i]);
+
+        Net net;
+        LayerParams lp;
+        lp.type = "Slice";
+        lp.name = "testLayer";
+        lp.set("begin", DictValue::arrayInt<int*>((int*)&begin[0], DIMS));
+        lp.set("end", DictValue::arrayInt<int*>((int*)&end[0], DIMS));
+        net.addLayerToPrev(lp.name, lp.type, lp);
+
+        {
+            net.setInput(input);
+            net.setPreferableBackend(backendId);
+            net.setPreferableTarget(targetId);
+            Mat out = net.forward();
+
+            EXPECT_GT(cv::norm(out, NORM_INF), 0);
+            normAssert(out, input(range));
+#if 0
+            cout << input(range).clone().reshape(1, 1) << endl;
+            cout << out.reshape(1, 1) << endl;
+#endif
+        }
+    }
+};
+
+TEST_P(Layer_Test_Slice, slice_channels_17762)
+{
+    const int inputShape[4] = {1, 16, 6, 8};
+    const int begin[] = {0, 4, 0, 0};
+    const int end[] = {1, 8, 6, 8};
+    test_slice<4>(inputShape, begin, end);
+}
+
+TEST_P(Layer_Test_Slice, slice_channels_with_batch_17762)
+{
+    const int inputShape[4] = {4, 4, 3, 4};
+    const int begin[] = {0, 1, 0, 0};
+    const int end[] = {4, 3, 3, 4};
+    test_slice<4>(inputShape, begin, end);
+}
+
+TEST_P(Layer_Test_Slice, slice_channels_and_batch_17762)
+{
+    int backend = get<0>(GetParam());
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+
+    const int inputShape[4] = {4, 4, 3, 4};
+    const int begin[] = {2, 1, 0, 0};
+    const int end[] = {4, 3, 3, 4};
+    test_slice<4>(inputShape, begin, end);
+}
+
+TEST_P(Layer_Test_Slice, slice_rows)
+{
+    const int inputShape[4] = {1, 2, 6, 4};
+    const int begin[] = {0, 0, 4, 0};
+    const int end[] = {1, 2, 6, 4};
+    test_slice<4>(inputShape, begin, end);
+}
+
+TEST_P(Layer_Test_Slice, slice_cols)
+{
+    const int inputShape[4] = {1, 2, 3, 8};
+    const int begin[] = {0, 0, 0, 4};
+    const int end[] = {1, 2, 3, 8};
+    test_slice<4>(inputShape, begin, end);
+}
+
+
+TEST_P(Layer_Test_Slice, slice_complex_1_unaligned)
+{
+    const int inputShape[4] = {1, 4, 2, 3};
+    const int begin[] = {0, 2, 1, 0};
+    const int end[] = {1, 3, 2, 2};
+    test_slice<4>(inputShape, begin, end);
+}
+
+TEST_P(Layer_Test_Slice, slice_complex_2_x4)
+{
+    const int inputShape[4] = {1, 3, 2, 4};
+    const int begin[] = {0, 2, 1, 0};
+    const int end[] = {1, 3, 2, 2};
+    test_slice<4>(inputShape, begin, end);
+}
+
+TEST_P(Layer_Test_Slice, slice_complex_3)
+{
+    const int inputShape[4] = {1, 6, 4, 8};
+    const int begin[] = {0, 2, 1, 4};
+    const int end[] = {1, 4, 3, 8};
+    test_slice<4>(inputShape, begin, end);
+}
+
 TEST_P(Layer_Test_Slice, variable_input_shape)
 {
     int backendId = get<0>(GetParam());