From 452fa3011ca7eff4d99d7ffaf738ca7012023fbb Mon Sep 17 00:00:00 2001
From: Alexander Alekhin
Date: Tue, 10 Jul 2018 15:00:42 +0300
Subject: [PATCH 1/3] dnn(test): drop CV_ENUM for DNNBackend / DNNTarget

---
 modules/dnn/test/test_backends.cpp       | 18 +++----
 modules/dnn/test/test_caffe_importer.cpp | 10 ++--
 modules/dnn/test/test_googlenet.cpp      |  2 +-
 modules/dnn/test/test_halide_layers.cpp  | 48 ++++++++++----------
 modules/dnn/test/test_precomp.hpp        | 61 ++++++++++++++++++-------
 modules/dnn/test/test_tf_importer.cpp    |  2 +-
 modules/dnn/test/test_torch_importer.cpp |  4 +-
 7 files changed, 85 insertions(+), 60 deletions(-)

diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp
index ad7eb0917e..49e1a2a983 100644
--- a/modules/dnn/test/test_backends.cpp
+++ b/modules/dnn/test/test_backends.cpp
@@ -278,19 +278,19 @@ TEST_P(DNNTestNetwork, FastNeuralStyle_eccv16)
     processNet("dnn/fast_neural_style_eccv16_starry_night.t7", "", inp, "", "", l1, lInf);
 }
 
-const tuple<DNNBackend, DNNTarget> testCases[] = {
+const tuple<Backend, Target> testCases[] = {
 #ifdef HAVE_HALIDE
-    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_HALIDE, DNN_TARGET_CPU),
-    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_HALIDE, DNN_TARGET_OPENCL),
+    tuple<Backend, Target>(DNN_BACKEND_HALIDE, DNN_TARGET_CPU),
+    tuple<Backend, Target>(DNN_BACKEND_HALIDE, DNN_TARGET_OPENCL),
 #endif
 #ifdef HAVE_INF_ENGINE
-    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_CPU),
-    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL),
-    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16),
-    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_MYRIAD),
+    tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_CPU),
+    tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL),
+    tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16),
+    tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_MYRIAD),
 #endif
-    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL),
-    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL_FP16)
+    tuple<Backend, Target>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL),
+    tuple<Backend, Target>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL_FP16)
 };
 
 INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, testing::ValuesIn(testCases));
diff --git a/modules/dnn/test/test_caffe_importer.cpp b/modules/dnn/test/test_caffe_importer.cpp
index c99e4e63f7..5365b2a435 100644
--- a/modules/dnn/test/test_caffe_importer.cpp
+++ b/modules/dnn/test/test_caffe_importer.cpp
@@ -82,7 +82,7 @@ TEST(Test_Caffe, read_googlenet)
     ASSERT_FALSE(net.empty());
 }
 
-typedef testing::TestWithParam<tuple<bool, DNNTarget> > Reproducibility_AlexNet;
+typedef testing::TestWithParam<tuple<bool, Target> > Reproducibility_AlexNet;
 TEST_P(Reproducibility_AlexNet, Accuracy)
 {
     bool readFromMemory = get<0>(GetParam());
@@ -179,7 +179,7 @@ TEST(Reproducibility_SSD, Accuracy)
     normAssertDetections(ref, out);
 }
 
-typedef testing::TestWithParam<DNNTarget> Reproducibility_MobileNet_SSD;
+typedef testing::TestWithParam<Target> Reproducibility_MobileNet_SSD;
 TEST_P(Reproducibility_MobileNet_SSD, Accuracy)
 {
     const string proto = findDataFile("dnn/MobileNetSSD_deploy.prototxt", false);
@@ -234,7 +234,7 @@ TEST_P(Reproducibility_MobileNet_SSD, Accuracy)
 INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_MobileNet_SSD,
                         Values(DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16));
 
-typedef testing::TestWithParam<DNNTarget> Reproducibility_ResNet50;
+typedef testing::TestWithParam<Target> Reproducibility_ResNet50;
 TEST_P(Reproducibility_ResNet50, Accuracy)
 {
     Net net = readNetFromCaffe(findDataFile("dnn/ResNet-50-deploy.prototxt", false),
@@ -270,7 +270,7 @@ TEST_P(Reproducibility_ResNet50, Accuracy)
 INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_ResNet50,
                         Values(DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16));
 
-typedef testing::TestWithParam<DNNTarget> Reproducibility_SqueezeNet_v1_1;
+typedef testing::TestWithParam<Target> Reproducibility_SqueezeNet_v1_1;
 TEST_P(Reproducibility_SqueezeNet_v1_1, Accuracy)
 {
     Net net = readNetFromCaffe(findDataFile("dnn/squeezenet_v1.1.prototxt", false),
@@ -413,7 +413,7 @@ TEST(Test_Caffe, multiple_inputs)
     normAssert(out, first_image + second_image);
 }
 
-typedef testing::TestWithParam<tuple<std::string, DNNTarget> > opencv_face_detector;
+typedef testing::TestWithParam<tuple<std::string, Target> > opencv_face_detector;
 TEST_P(opencv_face_detector, Accuracy)
 {
     std::string proto = findDataFile("dnn/opencv_face_detector.prototxt", false);
diff --git a/modules/dnn/test/test_googlenet.cpp b/modules/dnn/test/test_googlenet.cpp
index a2ea731acc..37064c35c4 100644
--- a/modules/dnn/test/test_googlenet.cpp
+++ b/modules/dnn/test/test_googlenet.cpp
@@ -52,7 +52,7 @@ static std::string _tf(TString filename)
     return (getOpenCVExtraDir() + "/dnn/") + filename;
 }
 
-typedef testing::TestWithParam<DNNTarget> Reproducibility_GoogLeNet;
+typedef testing::TestWithParam<Target> Reproducibility_GoogLeNet;
 TEST_P(Reproducibility_GoogLeNet, Batching)
 {
     Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt", false),
diff --git a/modules/dnn/test/test_halide_layers.cpp b/modules/dnn/test/test_halide_layers.cpp
index b21ae85800..c61f7e378a 100644
--- a/modules/dnn/test/test_halide_layers.cpp
+++ b/modules/dnn/test/test_halide_layers.cpp
@@ -41,21 +41,21 @@ static void test(LayerParams& params, Mat& input, int backendId, int targetId)
     test(input, net, backendId, targetId);
 }
 
-static testing::internal::ParamGenerator<tuple<DNNBackend, DNNTarget> > dnnBackendsAndTargetsWithHalide()
+static testing::internal::ParamGenerator<tuple<Backend, Target> > dnnBackendsAndTargetsWithHalide()
 {
-    static const tuple<DNNBackend, DNNTarget> testCases[] = {
+    static const tuple<Backend, Target> testCases[] = {
 #ifdef HAVE_HALIDE
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_HALIDE, DNN_TARGET_CPU),
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_HALIDE, DNN_TARGET_OPENCL),
+        tuple<Backend, Target>(DNN_BACKEND_HALIDE, DNN_TARGET_CPU),
+        tuple<Backend, Target>(DNN_BACKEND_HALIDE, DNN_TARGET_OPENCL),
 #endif
 #ifdef HAVE_INF_ENGINE
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_CPU),
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL),
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16),
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_MYRIAD),
+        tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_CPU),
+        tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL),
+        tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16),
+        tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_MYRIAD),
 #endif
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL),
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL_FP16)
+        tuple<Backend, Target>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL),
+        tuple<Backend, Target>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL_FP16)
     };
     return testing::ValuesIn(testCases);
 }
@@ -89,7 +89,7 @@ TEST_P(Test_Halide_layers, Padding)
 ////////////////////////////////////////////////////////////////////////////////
 // Convolution
 ////////////////////////////////////////////////////////////////////////////////
-typedef TestWithParam<tuple<Vec3i, Size, Size, Size, Size, Size, bool, tuple<DNNBackend, DNNTarget> > > Convolution;
+typedef TestWithParam<tuple<Vec3i, Size, Size, Size, Size, Size, bool, tuple<Backend, Target> > > Convolution;
 TEST_P(Convolution, Accuracy)
 {
     int inChannels = get<0>(GetParam())[0];
@@ -154,7 +154,7 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, Convolution, Combine(
 ////////////////////////////////////////////////////////////////////////////////
 // Deconvolution
 ////////////////////////////////////////////////////////////////////////////////
-typedef TestWithParam<tuple<Vec3i, Size, Size, Size, Size, Size, bool, tuple<DNNBackend, DNNTarget> > > Deconvolution;
+typedef TestWithParam<tuple<Vec3i, Size, Size, Size, Size, Size, bool, tuple<Backend, Target> > > Deconvolution;
 TEST_P(Deconvolution, Accuracy)
 {
     int inChannels = get<0>(GetParam())[0];
@@ -220,7 +220,7 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, Deconvolution, Combine(
 ////////////////////////////////////////////////////////////////////////////////
 // LRN
 ////////////////////////////////////////////////////////////////////////////////
-typedef TestWithParam<tuple<Vec3i, int, float, float, float, std::string, tuple<DNNBackend, DNNTarget> > > LRN;
+typedef TestWithParam<tuple<Vec3i, int, float, float, float, std::string, tuple<Backend, Target> > > LRN;
 TEST_P(LRN, Accuracy)
 {
     int inChannels = get<0>(GetParam())[0];
@@ -265,7 +265,7 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, LRN, Combine(
 ////////////////////////////////////////////////////////////////////////////////
 // Average pooling
 ////////////////////////////////////////////////////////////////////////////////
-typedef TestWithParam<tuple<int, Size, Size, Size, tuple<DNNBackend, DNNTarget> > > AvePooling;
+typedef TestWithParam<tuple<int, Size, Size, Size, tuple<Backend, Target> > > AvePooling;
 TEST_P(AvePooling, Accuracy)
 {
     int inChannels = get<0>(GetParam());
@@ -305,7 +305,7 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, AvePooling, Combine(
 ////////////////////////////////////////////////////////////////////////////////
 // Maximum pooling
 ////////////////////////////////////////////////////////////////////////////////
-typedef TestWithParam<tuple<int, Size, Size, Size, Size, tuple<DNNBackend, DNNTarget> > > MaxPooling;
+typedef TestWithParam<tuple<int, Size, Size, Size, Size, tuple<Backend, Target> > > MaxPooling;
 TEST_P(MaxPooling, Accuracy)
 {
     int inChannels = get<0>(GetParam());
@@ -344,7 +344,7 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, MaxPooling, Combine(
 ////////////////////////////////////////////////////////////////////////////////
 // Fully-connected
 ////////////////////////////////////////////////////////////////////////////////
-typedef TestWithParam<tuple<int, Size, int, bool, tuple<DNNBackend, DNNTarget> > > FullyConnected;
+typedef TestWithParam<tuple<int, Size, int, bool, tuple<Backend, Target> > > FullyConnected;
 TEST_P(FullyConnected, Accuracy)
 {
     int inChannels = get<0>(GetParam());
@@ -387,7 +387,7 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, FullyConnected, Combine(
 ////////////////////////////////////////////////////////////////////////////////
 // SoftMax
 ////////////////////////////////////////////////////////////////////////////////
-typedef TestWithParam<tuple<int, tuple<DNNBackend, DNNTarget> > > SoftMax;
+typedef TestWithParam<tuple<int, tuple<Backend, Target> > > SoftMax;
 TEST_P(SoftMax, Accuracy)
 {
     int inChannels = get<0>(GetParam());
@@ -476,7 +476,7 @@ void testInPlaceActivation(LayerParams& lp, int backendId, int targetId)
     test(input, net, backendId, targetId);
 }
 
-typedef TestWithParam<tuple<bool, bool, float, tuple<DNNBackend, DNNTarget> > > BatchNorm;
+typedef TestWithParam<tuple<bool, bool, float, tuple<Backend, Target> > > BatchNorm;
 TEST_P(BatchNorm, Accuracy)
 {
     bool hasWeights = get<0>(GetParam());
@@ -511,7 +511,7 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, BatchNorm, Combine(
     dnnBackendsAndTargetsWithHalide()
 ));
 
-typedef TestWithParam<tuple<float, tuple<DNNBackend, DNNTarget> > > ReLU;
+typedef TestWithParam<tuple<float, tuple<Backend, Target> > > ReLU;
 TEST_P(ReLU, Accuracy)
 {
     float negativeSlope = get<0>(GetParam());
@@ -530,7 +530,7 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, ReLU, Combine(
     dnnBackendsAndTargetsWithHalide()
 ));
 
-typedef TestWithParam<tuple<std::string, tuple<DNNBackend, DNNTarget> > > NoParamActivation;
+typedef TestWithParam<tuple<std::string, tuple<Backend, Target> > > NoParamActivation;
 TEST_P(NoParamActivation, Accuracy)
 {
     int backendId = get<0>(get<1>(GetParam()));
@@ -546,7 +546,7 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, NoParamActivation, Combine(
     dnnBackendsAndTargetsWithHalide()
 ));
 
-typedef TestWithParam<tuple<Vec3f, tuple<DNNBackend, DNNTarget> > > Power;
+typedef TestWithParam<tuple<Vec3f, tuple<Backend, Target> > > Power;
 TEST_P(Power, Accuracy)
 {
     float power = get<0>(GetParam())[0];
@@ -582,7 +582,7 @@ TEST_P(Test_Halide_layers, ChannelsPReLU)
     testInPlaceActivation(lp, backend, target);
 }
 
-typedef TestWithParam<tuple<bool, tuple<DNNBackend, DNNTarget> > > Scale;
+typedef TestWithParam<tuple<bool, tuple<Backend, Target> > > Scale;
 TEST_P(Scale, Accuracy)
 {
     bool hasBias = get<0>(GetParam());
@@ -616,7 +616,7 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, Scale, Combine(
 // `--- conv ----^ ^ ^
 // `---- ... ------' '
 // `-----------------'
-typedef TestWithParam<tuple<Vec3i, Vec3i, tuple<DNNBackend, DNNTarget> > > Concat;
+typedef TestWithParam<tuple<Vec3i, Vec3i, tuple<Backend, Target> > > Concat;
 TEST_P(Concat, Accuracy)
 {
     Vec3i inSize = get<0>(GetParam());
@@ -682,7 +682,7 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, Concat, Combine(
 // `--- conv ----^ ^ ^
 // `---- ... ------' '
 // `-----------------'
-typedef TestWithParam<tuple<Vec3i, std::string, int, bool, tuple<DNNBackend, DNNTarget> > > Eltwise;
+typedef TestWithParam<tuple<Vec3i, std::string, int, bool, tuple<Backend, Target> > > Eltwise;
 TEST_P(Eltwise, Accuracy)
 {
     Vec3i inSize = get<0>(GetParam());
diff --git a/modules/dnn/test/test_precomp.hpp b/modules/dnn/test/test_precomp.hpp
index 16f1e5c846..a0a46ab271 100644
--- a/modules/dnn/test/test_precomp.hpp
+++ b/modules/dnn/test/test_precomp.hpp
@@ -49,15 +49,41 @@
 #include "opencv2/dnn.hpp"
 #include "test_common.hpp"
 
-namespace opencv_test { namespace {
-using namespace cv::dnn;
+namespace cv {
+namespace dnn {
+CV__DNN_EXPERIMENTAL_NS_BEGIN
+
+static inline void PrintTo(const cv::dnn::Backend& v, std::ostream* os)
+{
+    switch (v) {
+    case DNN_BACKEND_DEFAULT: *os << "DNN_BACKEND_DEFAULT"; return;
+    case DNN_BACKEND_HALIDE: *os << "DNN_BACKEND_HALIDE"; return;
+    case DNN_BACKEND_INFERENCE_ENGINE: *os << "DNN_BACKEND_INFERENCE_ENGINE"; return;
+    case DNN_BACKEND_OPENCV: *os << "DNN_BACKEND_OPENCV"; return;
+    } // don't use "default:" to emit compiler warnings
+    *os << "DNN_BACKEND_UNKNOWN(" << v << ")";
+}
+
+static inline void PrintTo(const cv::dnn::Target& v, std::ostream* os)
+{
+    switch (v) {
+    case DNN_TARGET_CPU: *os << "DNN_TARGET_CPU"; return;
+    case DNN_TARGET_OPENCL: *os << "DNN_TARGET_OPENCL"; return;
+    case DNN_TARGET_OPENCL_FP16: *os << "DNN_TARGET_OPENCL_FP16"; return;
+    case DNN_TARGET_MYRIAD: *os << "DNN_TARGET_MYRIAD"; return;
+    } // don't use "default:" to emit compiler warnings
+    *os << "DNN_TARGET_UNKNOWN(" << v << ")";
+}
 
-CV_ENUM(DNNBackend, DNN_BACKEND_DEFAULT, DNN_BACKEND_HALIDE, DNN_BACKEND_INFERENCE_ENGINE, DNN_BACKEND_OPENCV)
-CV_ENUM(DNNTarget, DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16, DNN_TARGET_MYRIAD)
+CV__DNN_EXPERIMENTAL_NS_END
+}} // namespace
 
-static testing::internal::ParamGenerator<DNNTarget> availableDnnTargets()
+namespace opencv_test {
+using namespace cv::dnn;
+
+static testing::internal::ParamGenerator<Target> availableDnnTargets()
 {
-    static std::vector<DNNTarget> targets;
+    static std::vector<Target> targets;
     if (targets.empty())
     {
         targets.push_back(DNN_TARGET_CPU);
@@ -69,23 +95,23 @@ static testing::internal::ParamGenerator<DNNTarget> availableDnnTargets()
     return testing::ValuesIn(targets);
 }
 
-static testing::internal::ParamGenerator<tuple<DNNBackend, DNNTarget> > dnnBackendsAndTargets()
+static testing::internal::ParamGenerator<tuple<Backend, Target> > dnnBackendsAndTargets()
 {
-    static const tuple<DNNBackend, DNNTarget> testCases[] = {
+    static const tuple<Backend, Target> testCases[] = {
 #ifdef HAVE_INF_ENGINE
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_CPU),
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL),
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16),
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_MYRIAD),
+        tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_CPU),
+        tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL),
+        tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16),
+        tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_MYRIAD),
 #endif
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_OPENCV, DNN_TARGET_CPU),
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL),
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL_FP16)
+        tuple<Backend, Target>(DNN_BACKEND_OPENCV, DNN_TARGET_CPU),
+        tuple<Backend, Target>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL),
+        tuple<Backend, Target>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL_FP16)
     };
     return testing::ValuesIn(testCases);
 }
 
-class DNNTestLayer : public TestWithParam<tuple<DNNBackend, DNNTarget> >
+class DNNTestLayer : public TestWithParam<tuple<Backend, Target> >
 {
 public:
     dnn::Backend backend;
@@ -156,6 +182,5 @@ protected:
     }
 };
 
-}}
-
+} // namespace
 #endif
diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp
index 8f3822cc96..66b9d4f642 100644
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@@ -243,7 +243,7 @@ TEST_P(Test_TensorFlow_layers, l2_normalize_3d)
     runTensorFlowNet("l2_normalize_3d");
 }
 
-typedef testing::TestWithParam<DNNTarget> Test_TensorFlow_nets;
+typedef testing::TestWithParam<Target> Test_TensorFlow_nets;
 
 TEST_P(Test_TensorFlow_nets, MobileNet_SSD)
 {
diff --git a/modules/dnn/test/test_torch_importer.cpp b/modules/dnn/test/test_torch_importer.cpp
index c1abdc930d..37966a1f93 100644
--- a/modules/dnn/test/test_torch_importer.cpp
+++ b/modules/dnn/test/test_torch_importer.cpp
@@ -100,7 +100,7 @@ static void runTorchNet(String prefix, int targetId = DNN_TARGET_CPU, String out
     }
 }
 
-typedef testing::TestWithParam<DNNTarget> Test_Torch_layers;
+typedef testing::TestWithParam<Target> Test_Torch_layers;
 
 TEST_P(Test_Torch_layers, run_convolution)
 {
@@ -208,7 +208,7 @@ TEST_P(Test_Torch_layers, net_non_spatial)
 
 INSTANTIATE_TEST_CASE_P(/**/, Test_Torch_layers, availableDnnTargets());
 
-typedef testing::TestWithParam<DNNTarget> Test_Torch_nets;
+typedef testing::TestWithParam<Target> Test_Torch_nets;
 
 TEST_P(Test_Torch_nets, OpenFace_accuracy)
 {

From d7bd662c95084f4d559feb7fa7ee75bdf0b02cc4 Mon Sep 17 00:00:00 2001
From: catree
Date: Tue, 10 Jul 2018 15:35:46 +0200
Subject: [PATCH 2/3] Add a note in the documentation about Mat::ones and
 Mat::eye

With multi-channel types (e.g. CV_8UC3), only the first channel is
initialized.

---
 modules/core/include/opencv2/core/mat.hpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp
index 7be97c803e..2d1d8e08f4 100644
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -1324,7 +1324,7 @@ public:
     /** @brief Returns a zero array of the specified size and type.
 
    The method returns a Matlab-style zero array initializer. It can be used to quickly form a constant
-    array as a function parameter, part of a matrix expression, or as a matrix initializer. :
+    array as a function parameter, part of a matrix expression, or as a matrix initializer:
     @code
         Mat A;
         A = Mat::zeros(3, 3, CV_32F);
@@ -1360,6 +1360,8 @@ public:
     The above operation does not form a 100x100 matrix of 1's and then multiply it by 3. Instead, it
     just remembers the scale factor (3 in this case) and use it when actually invoking the matrix
     initializer.
+    @note In the case of multi-channel types, only the first channel will be initialized with 1's;
+    the others will be set to 0's.
     @param rows Number of rows.
     @param cols Number of columns.
     @param type Created matrix type.
@@ -1387,6 +1389,8 @@ public:
         // make a 4x4 diagonal matrix with 0.1's on the diagonal.
         Mat A = Mat::eye(4, 4, CV_32F)*0.1;
     @endcode
+    @note In the case of multi-channel types, the identity matrix will be initialized only for the
+    first channel; the others will be set to 0's.
     @param rows Number of rows.
     @param cols Number of columns.
     @param type Created matrix type.
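[Editor's aside on PATCH 2/3 -- a minimal usage sketch, not part of the patch
series: it demonstrates the behaviour the new @note documents. Mat::ones with a
multi-channel type fills only the first channel with 1's, because the
initializer expands to Scalar(1), whose remaining components default to 0.]

    #include <opencv2/core.hpp>
    #include <iostream>

    int main()
    {
        // Each "1" lands in channel 0 only; channels 1 and 2 stay 0.
        cv::Mat m = cv::Mat::ones(2, 2, CV_8UC3);
        std::cout << m << std::endl;
        // Prints (each pixel is (1, 0, 0), not (1, 1, 1)):
        // [  1,   0,   0,   1,   0,   0;
        //    1,   0,   0,   1,   0,   0]
        return 0;
    }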
From 4c5a86828a81cc3445c102004591157f28f331b3 Mon Sep 17 00:00:00 2001
From: Li Peng
Date: Tue, 10 Jul 2018 12:43:03 +0800
Subject: [PATCH 3/3] Fix gemmlike convolution input reading

use vload3 for half3 or float3 input vector reading, also check
read position to see if it exceeds input width

Signed-off-by: Li Peng

---
 modules/dnn/src/opencl/conv_layer_spatial.cl | 169 +++++++++++++------
 1 file changed, 115 insertions(+), 54 deletions(-)

diff --git a/modules/dnn/src/opencl/conv_layer_spatial.cl b/modules/dnn/src/opencl/conv_layer_spatial.cl
index dc7b047fe5..2cc161d3ff 100644
--- a/modules/dnn/src/opencl/conv_layer_spatial.cl
+++ b/modules/dnn/src/opencl/conv_layer_spatial.cl
@@ -467,7 +467,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
     int saved_y = curr_y;
 #endif
     const __global Dtype *src0_read = src0
-        + aligned_input_size * global_z            // batch offset
+        + aligned_input_size * global_z                            // batch offset
         + (curr_y - INPUT_PAD_H) * ROW_PITCH       // y offset
         + (curr_x - INPUT_PAD_W);                  // x offset
 
@@ -502,15 +502,23 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
         const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
 #if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
+#if KERNEL_WIDTH == 3
+        Dtype_t blockA00 = vload3(0, src0_read);
+        Dtype* pblockA00 = (Dtype*)(&blockA00);
+#else
         Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];
         Dtype* pblockA00 = (Dtype*)(&blockA00);
+#endif
 #else
         Dtype_t blockA00;
         Dtype* pblockA00 = (Dtype*)(&blockA00);
         int pos = 0;
         LOOP(KERNEL_WIDTH, pos,
         {
-            if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
+            if (curr_y >= INPUT_PAD_H &&
+                curr_y < input_height + INPUT_PAD_H &&
+                curr_x + pos * DILATION_X >= INPUT_PAD_W &&
+                curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
                 pblockA00[pos] = src0_read[pos * DILATION_X];
             else
                 pblockA00[pos] = 0;
 
@@ -564,17 +572,18 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
         //while( ++patch_row < 1 ); //debug
         while( ++patch_row < KERNEL_HEIGHT );
 
-        src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); // reset to start of next slice of patch
+        // reset to start of next slice of patch
+        src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);
     }
     //while ( ++patch_depth < 1 );  //debug
     while ( ++patch_depth < INPUT_DEPTH );
 
     // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
     // (SIMD * TILE_M) x 1 x TILE_N.  Partial writes most likely generated if padding used.
- int out_offset = global_z * out_pitch_z // batch offset - + ( group_x * TILE_N ) * out_pitch_y // channel offset + int out_offset = global_z * out_pitch_z // batch offset + + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset - + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset + + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset __global Dtype *out = dst + out_offset; #if APPLY_BIAS @@ -621,7 +630,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) int saved_y = curr_y; #endif const __global Dtype *src0_read = src0 - + aligned_input_size * global_z // batch offset + + aligned_input_size * global_z // batch offset + (curr_y - INPUT_PAD_H) * ROW_PITCH // y offset + (curr_x - INPUT_PAD_W); // x offset @@ -653,7 +662,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) int pos = 0; LOOP(KERNEL_WIDTH, pos, { - if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W) + if (curr_y >= INPUT_PAD_H && + curr_y < input_height + INPUT_PAD_H && + curr_x + pos * DILATION_X >= INPUT_PAD_W && + curr_x + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA00[pos] = src0_read[pos * DILATION_X]; else pblockA00[pos] = 0; @@ -730,17 +742,18 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) //while( ++patch_row < 1 ); //debug while( ++patch_row < KERNEL_HEIGHT ); - src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch + // reset to start of next slice of patch + src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); } //while ( ++patch_depth < 1 ); //debug while ( ++patch_depth < INPUT_DEPTH ); // Dst resembles a cube of width x height x (output channel * batches). Each tile writes: // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. - int out_offset = global_z * out_pitch_z // batch offset - + ( group_x * TILE_N ) * out_pitch_y // channel offset + int out_offset = global_z * out_pitch_z // batch offset + + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset - + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset + + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset __global Dtype *out = dst + out_offset; #if APPLY_BIAS Dtype bias[4]; @@ -849,11 +862,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) int saved_y1 = curr_y1; #endif const __global Dtype *src0_read0 = src0 - + aligned_input_size * global_z // batch offset + + aligned_input_size * global_z // batch offset + (curr_y0 - INPUT_PAD_H) * ROW_PITCH // y offset + curr_x0 - INPUT_PAD_W; // x offset const __global Dtype *src0_read1 = src0 - + aligned_input_size * global_z // batch offset + + aligned_input_size * global_z // batch offset + (curr_y1 - INPUT_PAD_H) * ROW_PITCH // y offset + curr_x1 - INPUT_PAD_W; // x offset @@ -883,17 +896,38 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) // ... 
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; #if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 - Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH; - Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH; + #if KERNEL_WIDTH == 3 + Dtype_t blockA00 = vload3(0, src0_read0); src0_read0 += ROW_PITCH; + Dtype_t blockA01 = vload3(0, src0_read1); src0_read1 += ROW_PITCH; Dtype* pblockA00 = (Dtype*)(&blockA00); Dtype* pblockA01 = (Dtype*)(&blockA01); + #else + Dtype_t blockA00 = { (Dtype)0.f }; + Dtype_t blockA01 = { (Dtype)0.f }; + Dtype* pblockA00 = (Dtype*)(&blockA00); + Dtype* pblockA01 = (Dtype*)(&blockA01); + int pos = 0; + LOOP(KERNEL_WIDTH, pos, + { + if (curr_x0 + pos < input_width) + pblockA00[pos] = src0_read0[pos]; + + if (curr_x1 + pos < input_width) + pblockA01[pos] = src0_read1[pos]; + }) + src0_read0 += ROW_PITCH; + src0_read1 += ROW_PITCH; + #endif #else Dtype_t blockA00; Dtype* pblockA00 = (Dtype*)(&blockA00); int pos = 0; LOOP(KERNEL_WIDTH, pos, { - if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) + if (curr_y0 >= INPUT_PAD_H && + curr_y0 < input_height + INPUT_PAD_H && + curr_x0 + pos * DILATION_X >= INPUT_PAD_W && + curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA00[pos] = src0_read0[pos * DILATION_X]; else pblockA00[pos] = 0; @@ -904,7 +938,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) pos = 0; LOOP(KERNEL_WIDTH, pos, { - if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) + if (curr_y1 >= INPUT_PAD_H && + curr_y1 < input_height + INPUT_PAD_H && + curr_x1 + pos * DILATION_X >= INPUT_PAD_W && + curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA01[pos] = src0_read1[pos * DILATION_X]; else pblockA01[pos] = 0; @@ -972,7 +1009,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) curr_y0 = saved_y0; curr_y1 = saved_y1; #endif - src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch + // reset to start of next slice of patch + src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); } //while ( ++patch_depth < 1 ); //debug @@ -980,14 +1018,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) // Dst resembles a cube of width x height x (output channel * batches). Each tile writes: // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. 
- int out0_offset = global_z * out_pitch_z // batch offset - + ( group_x * TILE_N ) * out_pitch_y // channel offset + int out0_offset = global_z * out_pitch_z // batch offset + + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset - + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset - int out1_offset = global_z * out_pitch_z // batch offset - + ( group_x * TILE_N ) * out_pitch_y // channel offset + + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset + int out1_offset = global_z * out_pitch_z // batch offset + + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset - + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset + + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset #if APPLY_BIAS Dtype bias[4]; @@ -1049,11 +1087,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) int saved_y1 = curr_y1; #endif const __global Dtype *src0_read0 = src0 - + aligned_input_size * global_z // batch offset + + aligned_input_size * global_z // batch offset + (curr_y0 - INPUT_PAD_H) * ROW_PITCH // y offset + curr_x0 - INPUT_PAD_W; // x offset const __global Dtype *src0_read1 = src0 - + aligned_input_size * global_z // batch offset + + aligned_input_size * global_z // batch offset + (curr_y1 - INPUT_PAD_H) * ROW_PITCH // y offset + curr_x1 - INPUT_PAD_W; // x offset @@ -1084,7 +1122,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) int pos = 0; LOOP(KERNEL_WIDTH, pos, { - if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) + if (curr_y0 >= INPUT_PAD_H && + curr_y0 < input_height + INPUT_PAD_H && + curr_x0 + pos * DILATION_X >= INPUT_PAD_W && + curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA00[pos] = src0_read0[pos * DILATION_X]; else pblockA00[pos] = 0; @@ -1095,7 +1136,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) pos = 0; LOOP(KERNEL_WIDTH, pos, { - if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) + if (curr_y1 >= INPUT_PAD_H && + curr_y1 < input_height + INPUT_PAD_H && + curr_x1 + pos * DILATION_X >= INPUT_PAD_W && + curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA01[pos] = src0_read1[pos * DILATION_X]; else pblockA01[pos] = 0; @@ -1185,7 +1229,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) curr_y0 = saved_y0; curr_y1 = saved_y1; #endif - src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch + // reset to start of next slice of patch + src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); } //while ( ++patch_depth < 1 ); //debug @@ -1193,14 +1238,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) // Dst resembles a cube of width x height x (output channel * batches). Each tile writes: // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. 
- int out0_offset = global_z * out_pitch_z // batch offset - + ( group_x * TILE_N ) * out_pitch_y // channel offset + int out0_offset = global_z * out_pitch_z // batch offset + + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset - + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset - int out1_offset = global_z * out_pitch_z // batch offset - + ( group_x * TILE_N ) * out_pitch_y // channel offset + + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset + int out1_offset = global_z * out_pitch_z // batch offset + + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset - + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset + + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset __global Dtype *out1 = dst + out1_offset; #if APPLY_BIAS @@ -1352,9 +1397,9 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) int saved_y = curr_y; #endif const __global Dtype *src0_read = src0 - + aligned_input_size * global_z // batch offset + + aligned_input_size * global_z // batch offset + (curr_y - INPUT_PAD_H) * ROW_PITCH // y offset - + curr_x - INPUT_PAD_W; // x offset + + curr_x - INPUT_PAD_W; // x offset const __global Dtype *src0_read_orig = src0_read; // Src1 (filter) is directly used as btile. @@ -1409,15 +1454,23 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; #if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 + #if KERNEL_WIDTH == 3 + Dtype_t blockA00 = vload3(0, src0_read); + Dtype* pblockA00 = (Dtype*)(&blockA00); + #else Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ]; Dtype* pblockA00 = (Dtype*)(&blockA00); + #endif #else Dtype_t blockA00; Dtype* pblockA00 = (Dtype*)(&blockA00); int pos = 0; LOOP(KERNEL_WIDTH, pos, { - if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W) + if (curr_y >= INPUT_PAD_H && + curr_y < input_height + INPUT_PAD_H && + curr_x + pos * DILATION_X >= INPUT_PAD_W && + curr_x + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA00[pos] = src0_read[pos * DILATION_X]; else pblockA00[pos] = 0; @@ -1463,17 +1516,18 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) //while( ++patch_row < 1 ); //debug while( ++patch_row < KERNEL_HEIGHT ); - src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch + // reset to start of next slice of patch + src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); } //while ( ++patch_depth < 1 ); //debug while ( ++patch_depth < INPUT_DEPTH ); // Dst resembles a cube of width x height x (output channel * batches). Each tile writes: // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. 
- int out_offset = global_z * out_pitch_z // batch offset - + ( group_x * TILE_N ) * out_pitch_y // channel offset + int out_offset = global_z * out_pitch_z // batch offset + + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset - + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset + + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset __global Dtype *out = dst + out_offset; #if APPLY_BIAS @@ -1556,11 +1610,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) int saved_y1 = curr_y1; #endif const __global Dtype *src0_read0 = src0 - + aligned_input_size * global_z // batch offset + + aligned_input_size * global_z // batch offset + (curr_y0 - INPUT_PAD_H) * ROW_PITCH // y offset + curr_x0 - INPUT_PAD_W; // x offset const __global Dtype *src0_read1 = src0 - + aligned_input_size * global_z // batch offset + + aligned_input_size * global_z // batch offset + (curr_y1 - INPUT_PAD_H) * ROW_PITCH // y offset + curr_x1 - INPUT_PAD_W; // x offset @@ -1600,7 +1654,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) int pos = 0; LOOP(KERNEL_WIDTH, pos, { - if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) + if (curr_y0 >= INPUT_PAD_H && + curr_y0 < input_height + INPUT_PAD_H && + curr_x0 + pos * DILATION_X >= INPUT_PAD_W && + curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA00[pos] = src0_read0[pos * DILATION_X]; else pblockA00[pos] = 0; @@ -1611,7 +1668,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) pos = 0; LOOP(KERNEL_WIDTH, pos, { - if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) + if (curr_y1 >= INPUT_PAD_H && + curr_y1 < input_height + INPUT_PAD_H && + curr_x1 + pos * DILATION_X >= INPUT_PAD_W && + curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA01[pos] = src0_read1[pos * DILATION_X]; else pblockA01[pos] = 0; @@ -1667,7 +1727,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) curr_y0 = saved_y0; curr_y1 = saved_y1; #endif - src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); // reset to start of next slice of patch + // reset to start of next slice of patch + src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); } //while ( ++patch_depth < 1 ); //debug @@ -1675,14 +1736,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) // Dst resembles a cube of width x height x (output channel * batches). Each tile writes: // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. 
- int out0_offset = global_z * out_pitch_z // batch offset - + ( group_x * TILE_N ) * out_pitch_y // channel offset + int out0_offset = global_z * out_pitch_z // batch offset + + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset - + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset - int out1_offset = global_z * out_pitch_z // batch offset - + ( group_x * TILE_N ) * out_pitch_y // channel offset + + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset + int out1_offset = global_z * out_pitch_z // batch offset + + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset - + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset + + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset #if APPLY_BIAS Dtype bias[2];
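[Editor's aside on PATCH 3/3 -- a minimal, self-contained OpenCL sketch, not
part of the patch: it illustrates why the patch switches to vload3() with an
explicit width check. A 3-component vector in OpenCL C occupies the same
storage as a 4-component one, so casting a scalar pointer to float3* and
dereferencing it can read one element past the end of a row (and may violate
alignment); vload3() reads exactly three scalars. The kernel name and buffer
layout below are illustrative assumptions.]

    // Sums three horizontally adjacent inputs per work-item, as a stand-in
    // for a KERNEL_WIDTH == 3 convolution row read.
    __kernel void row3_sum(__global const float* src, __global float* dst, int width)
    {
        int x = get_global_id(0);
        if (x + 3 > width) return;      // check read position against input width
        // vload3 reads exactly src[x], src[x+1], src[x+2] -- no 4th-element
        // over-read, unlike *((__global const float3*)(src + x)).
        float3 v = vload3(0, src + x);
        dst[x] = v.x + v.y + v.z;
    }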