diff --git a/3rdparty/ippicv/ippicv.cmake b/3rdparty/ippicv/ippicv.cmake index 751df33783..a54d8f11ae 100644 --- a/3rdparty/ippicv/ippicv.cmake +++ b/3rdparty/ippicv/ippicv.cmake @@ -2,37 +2,37 @@ function(download_ippicv root_var) set(${root_var} "" PARENT_SCOPE) # Commit SHA in the opencv_3rdparty repo - set(IPPICV_COMMIT "dfe3162c237af211e98b8960018b564bc209261d") + set(IPPICV_COMMIT "bdb7bb85f34a8cb0d35e40a81f58da431aa1557a") # Define actual ICV versions if(APPLE) set(OPENCV_ICV_PLATFORM "macosx") set(OPENCV_ICV_PACKAGE_SUBDIR "ippicv_mac") if(X86_64) - set(OPENCV_ICV_NAME "ippicv_2017u3_mac_intel64_general_20170822.tgz") - set(OPENCV_ICV_HASH "c1ebb5dfa5b7f54b0c44e1917805a463") + set(OPENCV_ICV_NAME "ippicv_2017u3_mac_intel64_general_20180518.tgz") + set(OPENCV_ICV_HASH "3ae52b9be0fe73dd45bc5e9429cd3732") else() - set(OPENCV_ICV_NAME "ippicv_2017u3_mac_ia32_general_20170822.tgz") - set(OPENCV_ICV_HASH "49b05a669042753ae75895a445ebd612") + set(OPENCV_ICV_NAME "ippicv_2017u3_mac_ia32_general_20180518.tgz") + set(OPENCV_ICV_HASH "698660b975b62bee3ef6c5af51e97544") endif() elseif((UNIX AND NOT ANDROID) OR (UNIX AND ANDROID_ABI MATCHES "x86")) set(OPENCV_ICV_PLATFORM "linux") set(OPENCV_ICV_PACKAGE_SUBDIR "ippicv_lnx") if(X86_64) - set(OPENCV_ICV_NAME "ippicv_2017u3_lnx_intel64_general_20170822.tgz") - set(OPENCV_ICV_HASH "4e0352ce96473837b1d671ce87f17359") + set(OPENCV_ICV_NAME "ippicv_2017u3_lnx_intel64_general_20180518.tgz") + set(OPENCV_ICV_HASH "b7cc351267db2d34b9efa1cd22ff0572") else() - set(OPENCV_ICV_NAME "ippicv_2017u3_lnx_ia32_general_20170822.tgz") - set(OPENCV_ICV_HASH "dcdb0ba4b123f240596db1840cd59a76") + set(OPENCV_ICV_NAME "ippicv_2017u3_lnx_ia32_general_20180518.tgz") + set(OPENCV_ICV_HASH "ea72de74dae3c604eb6348395366e78e") endif() elseif(WIN32 AND NOT ARM) set(OPENCV_ICV_PLATFORM "windows") set(OPENCV_ICV_PACKAGE_SUBDIR "ippicv_win") if(X86_64) - set(OPENCV_ICV_NAME "ippicv_2017u3_win_intel64_general_20170822.zip") - set(OPENCV_ICV_HASH 
"0421e642bc7ad741a2236d3ec4190bdd") + set(OPENCV_ICV_NAME "ippicv_2017u3_win_intel64_general_20180518.zip") + set(OPENCV_ICV_HASH "915ff92958089ede8ea532d3c4fe7187") else() - set(OPENCV_ICV_NAME "ippicv_2017u3_win_ia32_general_20170822.zip") - set(OPENCV_ICV_HASH "8a7680ae352c192de2e2e34936164bd0") + set(OPENCV_ICV_NAME "ippicv_2017u3_win_ia32_general_20180518.zip") + set(OPENCV_ICV_HASH "928168c2d99ab284047dfcfb7a821d91") endif() else() return() diff --git a/3rdparty/libtiff/CMakeLists.txt b/3rdparty/libtiff/CMakeLists.txt index 0e084d1a29..7767cf49b5 100644 --- a/3rdparty/libtiff/CMakeLists.txt +++ b/3rdparty/libtiff/CMakeLists.txt @@ -417,7 +417,7 @@ set(lib_srcs tif_write.c tif_zip.c tif_stream.cxx - snprintf.c + snprintf.c t4.h tif_dir.h tif_fax3.h diff --git a/doc/tutorials/dnn/dnn_custom_layers/dnn_custom_layers.md b/doc/tutorials/dnn/dnn_custom_layers/dnn_custom_layers.md index 5b3f3c7347..f367946620 100644 --- a/doc/tutorials/dnn/dnn_custom_layers/dnn_custom_layers.md +++ b/doc/tutorials/dnn/dnn_custom_layers/dnn_custom_layers.md @@ -32,11 +32,11 @@ Unspecified error: Can't create layer "layer_name" of type "MyType" in function To import the model correctly you have to derive a class from cv::dnn::Layer with the following methods: -@snippet dnn/custom_layers.cpp A custom layer interface +@snippet dnn/custom_layers.hpp A custom layer interface And register it before the import: -@snippet dnn/custom_layers.cpp Register a custom layer +@snippet dnn/custom_layers.hpp Register a custom layer @note `MyType` is a type of unimplemented layer from the thrown exception. @@ -44,27 +44,27 @@ Let's see what all the methods do: - Constructor -@snippet dnn/custom_layers.cpp MyLayer::MyLayer +@snippet dnn/custom_layers.hpp MyLayer::MyLayer Retrieves hyper-parameters from cv::dnn::LayerParams. If your layer has trainable weights they will be already stored in the Layer's member cv::dnn::Layer::blobs. 
- A static method `create` -@snippet dnn/custom_layers.cpp MyLayer::create +@snippet dnn/custom_layers.hpp MyLayer::create This method should create an instance of you layer and return cv::Ptr with it. - Output blobs' shape computation -@snippet dnn/custom_layers.cpp MyLayer::getMemoryShapes +@snippet dnn/custom_layers.hpp MyLayer::getMemoryShapes Returns layer's output shapes depends on input shapes. You may request an extra memory using `internals`. - Run a layer -@snippet dnn/custom_layers.cpp MyLayer::forward +@snippet dnn/custom_layers.hpp MyLayer::forward Implement a layer's logic here. Compute outputs for given inputs. @@ -74,7 +74,7 @@ the second invocation of `forward` will has the same data at `outputs` and `inte - Optional `finalize` method -@snippet dnn/custom_layers.cpp MyLayer::finalize +@snippet dnn/custom_layers.hpp MyLayer::finalize The chain of methods are the following: OpenCV deep learning engine calls `create` method once then it calls `getMemoryShapes` for an every created layer then you @@ -108,11 +108,11 @@ layer { This way our implementation can look like: -@snippet dnn/custom_layers.cpp InterpLayer +@snippet dnn/custom_layers.hpp InterpLayer Next we need to register a new layer type and try to import the model. -@snippet dnn/custom_layers.cpp Register InterpLayer +@snippet dnn/custom_layers.hpp Register InterpLayer ## Example: custom layer from TensorFlow This is an example of how to import a network with [tf.image.resize_bilinear](https://www.tensorflow.org/versions/master/api_docs/python/tf/image/resize_bilinear) @@ -185,11 +185,11 @@ Custom layers import from TensorFlow is designed to put all layer's `attr` into cv::dnn::LayerParams but input `Const` blobs into cv::dnn::Layer::blobs. In our case resize's output shape will be stored in layer's `blobs[0]`. -@snippet dnn/custom_layers.cpp ResizeBilinearLayer +@snippet dnn/custom_layers.hpp ResizeBilinearLayer Next we register a layer and try to import the model. 
-@snippet dnn/custom_layers.cpp Register ResizeBilinearLayer +@snippet dnn/custom_layers.hpp Register ResizeBilinearLayer ## Define a custom layer in Python The following example shows how to customize OpenCV's layers in Python. diff --git a/doc/tutorials/highgui/table_of_content_highgui.markdown b/doc/tutorials/highgui/table_of_content_highgui.markdown index a8f1d4e344..fb5a343664 100644 --- a/doc/tutorials/highgui/table_of_content_highgui.markdown +++ b/doc/tutorials/highgui/table_of_content_highgui.markdown @@ -5,6 +5,8 @@ This section contains tutorials about how to use the built-in graphical user int - @subpage tutorial_trackbar + *Languages:* C++, Java, Python + *Compatibility:* \> OpenCV 2.0 *Author:* Ana Huamán diff --git a/doc/tutorials/highgui/trackbar/trackbar.markdown b/doc/tutorials/highgui/trackbar/trackbar.markdown index 13898712ab..d6700d6387 100644 --- a/doc/tutorials/highgui/trackbar/trackbar.markdown +++ b/doc/tutorials/highgui/trackbar/trackbar.markdown @@ -1,11 +1,11 @@ Adding a Trackbar to our applications! {#tutorial_trackbar} ====================================== -- In the previous tutorials (about *linear blending* and the *brightness and contrast - adjustments*) you might have noted that we needed to give some **input** to our programs, such - as \f$\alpha\f$ and \f$beta\f$. We accomplished that by entering this data using the Terminal -- Well, it is time to use some fancy GUI tools. OpenCV provides some GUI utilities (*highgui.hpp*) - for you. An example of this is a **Trackbar** +- In the previous tutorials (about @ref tutorial_adding_images and the @ref tutorial_basic_linear_transform) + you might have noted that we needed to give some **input** to our programs, such + as \f$\alpha\f$ and \f$beta\f$. We accomplished that by entering this data using the Terminal. +- Well, it is time to use some fancy GUI tools. OpenCV provides some GUI utilities (**highgui** module) + for you. An example of this is a **Trackbar**. 
![](images/Adding_Trackbars_Tutorial_Trackbar.png) @@ -24,26 +24,73 @@ Code Let's modify the program made in the tutorial @ref tutorial_adding_images. We will let the user enter the \f$\alpha\f$ value by using the Trackbar. + +@add_toggle_cpp This tutorial code's is shown lines below. You can also download it from [here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp) @include cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp +@end_toggle + +@add_toggle_java +This tutorial code's is shown lines below. You can also download it from +[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/highgui/trackbar/AddingImagesTrackbar.java) +@include java/tutorial_code/highgui/trackbar/AddingImagesTrackbar.java +@end_toggle + +@add_toggle_python +This tutorial code's is shown lines below. You can also download it from +[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/highgui/trackbar/AddingImagesTrackbar.py) +@include python/tutorial_code/highgui/trackbar/AddingImagesTrackbar.py +@end_toggle Explanation ----------- We only analyze the code that is related to Trackbar: --# First, we load two images, which are going to be blended. - @snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp load +- First, we load two images, which are going to be blended. + +@add_toggle_cpp +@snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp load +@end_toggle + +@add_toggle_java +@snippet java/tutorial_code/highgui/trackbar/AddingImagesTrackbar.java load +@end_toggle + +@add_toggle_python +@snippet python/tutorial_code/highgui/trackbar/AddingImagesTrackbar.py load +@end_toggle --# To create a trackbar, first we have to create the window in which it is going to be located. So: - @snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp window +- To create a trackbar, first we have to create the window in which it is going to be located. 
So: --# Now we can create the Trackbar: - @snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp create_trackbar +@add_toggle_cpp +@snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp window +@end_toggle - Note the following: +@add_toggle_java +@snippet java/tutorial_code/highgui/trackbar/AddingImagesTrackbar.java window +@end_toggle +@add_toggle_python +@snippet python/tutorial_code/highgui/trackbar/AddingImagesTrackbar.py window +@end_toggle + +- Now we can create the Trackbar: + +@add_toggle_cpp +@snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp create_trackbar +@end_toggle + +@add_toggle_java +@snippet java/tutorial_code/highgui/trackbar/AddingImagesTrackbar.java create_trackbar +@end_toggle + +@add_toggle_python +@snippet python/tutorial_code/highgui/trackbar/AddingImagesTrackbar.py create_trackbar +@end_toggle + +Note the following (C++ code): - Our Trackbar has a label **TrackbarName** - The Trackbar is located in the window named **Linear Blend** - The Trackbar values will be in the range from \f$0\f$ to **alpha_slider_max** (the minimum @@ -51,10 +98,21 @@ We only analyze the code that is related to Trackbar: - The numerical value of Trackbar is stored in **alpha_slider** - Whenever the user moves the Trackbar, the callback function **on_trackbar** is called --# Finally, we have to define the callback function **on_trackbar** - @snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp on_trackbar +Finally, we have to define the callback function **on_trackbar** for C++ and Python code, using an anonymous inner class listener in Java + +@add_toggle_cpp +@snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp on_trackbar +@end_toggle + +@add_toggle_java +@snippet java/tutorial_code/highgui/trackbar/AddingImagesTrackbar.java on_trackbar +@end_toggle + +@add_toggle_python +@snippet python/tutorial_code/highgui/trackbar/AddingImagesTrackbar.py on_trackbar +@end_toggle - Note that: +Note that (C++ code): - We use the value of 
**alpha_slider** (integer) to get a double value for **alpha**. - **alpha_slider** is updated each time the trackbar is displaced by the user. - We define *src1*, *src2*, *dist*, *alpha*, *alpha_slider* and *beta* as global variables, diff --git a/doc/tutorials/imgproc/erosion_dilatation/erosion_dilatation.markdown b/doc/tutorials/imgproc/erosion_dilatation/erosion_dilatation.markdown index 515e6b26cb..8afcd2dea8 100644 --- a/doc/tutorials/imgproc/erosion_dilatation/erosion_dilatation.markdown +++ b/doc/tutorials/imgproc/erosion_dilatation/erosion_dilatation.markdown @@ -11,9 +11,6 @@ In this tutorial you will learn how to: - @ref cv::erode - @ref cv::dilate -Interesting fact ------------ - @note The explanation below belongs to the book **Learning OpenCV** by Bradski and Kaehler. Morphological Operations @@ -38,19 +35,14 @@ Morphological Operations - As the kernel \f$B\f$ is scanned over the image, we compute the maximal pixel value overlapped by \f$B\f$ and replace the image pixel in the anchor point position with that maximal value. As you can deduce, this maximizing operation causes bright regions within an image to "grow" (therefore the - name *dilation*). Take the above image as an example. Applying dilation we can get: - - ![](images/Morphology_1_Tutorial_Theory_Dilation.png) + name *dilation*). +- The dilatation operation is: \f$\texttt{dst} (x,y) = \max _{(x',y'): \, \texttt{element} (x',y') \ne0 } \texttt{src} (x+x',y+y')\f$ -The background (bright) dilates around the black regions of the letter. +- Take the above image as an example. Applying dilation we can get: -To better grasp the idea and avoid possible confusion, in this other example we have inverted the original -image such as the object in white is now the letter. We have performed two dilatations with a rectangular -structuring element of size `3x3`. 
- -![Left image: original image inverted, right image: resulting dilatation](images/Morphology_1_Tutorial_Theory_Dilatation_2.png) + ![](images/Morphology_1_Tutorial_Theory_Dilation.png) -The dilatation makes the object in white bigger. +- The bright area of the letter dilates around the black regions of the background. ### Erosion @@ -58,31 +50,39 @@ The dilatation makes the object in white bigger. area of given kernel. - As the kernel \f$B\f$ is scanned over the image, we compute the minimal pixel value overlapped by \f$B\f$ and replace the image pixel under the anchor point with that minimal value. +- The erosion operation is: \f$\texttt{dst} (x,y) = \min _{(x',y'): \, \texttt{element} (x',y') \ne0 } \texttt{src} (x+x',y+y')\f$ - Analagously to the example for dilation, we can apply the erosion operator to the original image - (shown above). You can see in the result below that the bright areas of the image (the - background, apparently), get thinner, whereas the dark zones (the "writing") gets bigger. + (shown above). You can see in the result below that the bright areas of the image get thinner, + whereas the dark zones gets bigger. ![](images/Morphology_1_Tutorial_Theory_Erosion.png) -In similar manner, the corresponding image results by applying erosion operation on the inverted original image (two erosions -with a rectangular structuring element of size `3x3`): - -![Left image: original image inverted, right image: resulting erosion](images/Morphology_1_Tutorial_Theory_Erosion_2.png) - -The erosion makes the object in white smaller. - Code ---- +@add_toggle_cpp This tutorial's code is shown below. You can also download it [here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ImgProc/Morphology_1.cpp) @include samples/cpp/tutorial_code/ImgProc/Morphology_1.cpp +@end_toggle + +@add_toggle_java +This tutorial's code is shown below. 
You can also download it +[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/ImgProc/erosion_dilatation/MorphologyDemo1.java) +@include samples/java/tutorial_code/ImgProc/erosion_dilatation/MorphologyDemo1.java +@end_toggle + +@add_toggle_python +This tutorial's code is shown below. You can also download it +[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/imgProc/erosion_dilatation/morphology_1.py) +@include samples/python/tutorial_code/imgProc/erosion_dilatation/morphology_1.py +@end_toggle Explanation ----------- -# Most of the material shown here is trivial (if you have any doubt, please refer to the tutorials in - previous sections). Let's check the general structure of the program: + previous sections). Let's check the general structure of the C++ program: - Load an image (can be BGR or grayscale) - Create two windows (one for dilation output, the other for erosion) diff --git a/doc/tutorials/imgproc/erosion_dilatation/images/Morphology_1_Tutorial_Theory_Dilatation_2.png b/doc/tutorials/imgproc/erosion_dilatation/images/Morphology_1_Tutorial_Theory_Dilatation_2.png deleted file mode 100644 index bdca7c6233..0000000000 Binary files a/doc/tutorials/imgproc/erosion_dilatation/images/Morphology_1_Tutorial_Theory_Dilatation_2.png and /dev/null differ diff --git a/doc/tutorials/imgproc/erosion_dilatation/images/Morphology_1_Tutorial_Theory_Dilation.png b/doc/tutorials/imgproc/erosion_dilatation/images/Morphology_1_Tutorial_Theory_Dilation.png index dae930ec1e..4f0df79ed9 100644 Binary files a/doc/tutorials/imgproc/erosion_dilatation/images/Morphology_1_Tutorial_Theory_Dilation.png and b/doc/tutorials/imgproc/erosion_dilatation/images/Morphology_1_Tutorial_Theory_Dilation.png differ diff --git a/doc/tutorials/imgproc/erosion_dilatation/images/Morphology_1_Tutorial_Theory_Erosion.png b/doc/tutorials/imgproc/erosion_dilatation/images/Morphology_1_Tutorial_Theory_Erosion.png index 755561023f..18e9970137 100644 
Binary files a/doc/tutorials/imgproc/erosion_dilatation/images/Morphology_1_Tutorial_Theory_Erosion.png and b/doc/tutorials/imgproc/erosion_dilatation/images/Morphology_1_Tutorial_Theory_Erosion.png differ diff --git a/doc/tutorials/imgproc/erosion_dilatation/images/Morphology_1_Tutorial_Theory_Erosion_2.png b/doc/tutorials/imgproc/erosion_dilatation/images/Morphology_1_Tutorial_Theory_Erosion_2.png deleted file mode 100644 index 5e666eef36..0000000000 Binary files a/doc/tutorials/imgproc/erosion_dilatation/images/Morphology_1_Tutorial_Theory_Erosion_2.png and /dev/null differ diff --git a/doc/tutorials/imgproc/erosion_dilatation/images/Morphology_1_Tutorial_Theory_Original_Image.png b/doc/tutorials/imgproc/erosion_dilatation/images/Morphology_1_Tutorial_Theory_Original_Image.png index d7e8a9a429..969294c7c6 100644 Binary files a/doc/tutorials/imgproc/erosion_dilatation/images/Morphology_1_Tutorial_Theory_Original_Image.png and b/doc/tutorials/imgproc/erosion_dilatation/images/Morphology_1_Tutorial_Theory_Original_Image.png differ diff --git a/doc/tutorials/imgproc/opening_closing_hats/images/Morphology_2_Tutorial_Theory_BlackHat.png b/doc/tutorials/imgproc/opening_closing_hats/images/Morphology_2_Tutorial_Theory_BlackHat.png index ce6ff35550..969bc20f2d 100644 Binary files a/doc/tutorials/imgproc/opening_closing_hats/images/Morphology_2_Tutorial_Theory_BlackHat.png and b/doc/tutorials/imgproc/opening_closing_hats/images/Morphology_2_Tutorial_Theory_BlackHat.png differ diff --git a/doc/tutorials/imgproc/opening_closing_hats/images/Morphology_2_Tutorial_Theory_Closing.png b/doc/tutorials/imgproc/opening_closing_hats/images/Morphology_2_Tutorial_Theory_Closing.png index f112bfef11..a034ac2496 100644 Binary files a/doc/tutorials/imgproc/opening_closing_hats/images/Morphology_2_Tutorial_Theory_Closing.png and b/doc/tutorials/imgproc/opening_closing_hats/images/Morphology_2_Tutorial_Theory_Closing.png differ diff --git 
a/doc/tutorials/imgproc/opening_closing_hats/images/Morphology_2_Tutorial_Theory_Closing_2.png b/doc/tutorials/imgproc/opening_closing_hats/images/Morphology_2_Tutorial_Theory_Closing_2.png deleted file mode 100644 index 57b790583f..0000000000 Binary files a/doc/tutorials/imgproc/opening_closing_hats/images/Morphology_2_Tutorial_Theory_Closing_2.png and /dev/null differ diff --git a/doc/tutorials/imgproc/opening_closing_hats/images/Morphology_2_Tutorial_Theory_Gradient.png b/doc/tutorials/imgproc/opening_closing_hats/images/Morphology_2_Tutorial_Theory_Gradient.png index 5d8077f0d2..44590016a0 100644 Binary files a/doc/tutorials/imgproc/opening_closing_hats/images/Morphology_2_Tutorial_Theory_Gradient.png and b/doc/tutorials/imgproc/opening_closing_hats/images/Morphology_2_Tutorial_Theory_Gradient.png differ diff --git a/doc/tutorials/imgproc/opening_closing_hats/images/Morphology_2_Tutorial_Theory_Opening.png b/doc/tutorials/imgproc/opening_closing_hats/images/Morphology_2_Tutorial_Theory_Opening.png index 7b8b0850fd..b63e99ed9d 100644 Binary files a/doc/tutorials/imgproc/opening_closing_hats/images/Morphology_2_Tutorial_Theory_Opening.png and b/doc/tutorials/imgproc/opening_closing_hats/images/Morphology_2_Tutorial_Theory_Opening.png differ diff --git a/doc/tutorials/imgproc/opening_closing_hats/images/Morphology_2_Tutorial_Theory_Opening_2.png b/doc/tutorials/imgproc/opening_closing_hats/images/Morphology_2_Tutorial_Theory_Opening_2.png deleted file mode 100644 index 973e13a2d2..0000000000 Binary files a/doc/tutorials/imgproc/opening_closing_hats/images/Morphology_2_Tutorial_Theory_Opening_2.png and /dev/null differ diff --git a/doc/tutorials/imgproc/opening_closing_hats/images/Morphology_2_Tutorial_Theory_TopHat.png b/doc/tutorials/imgproc/opening_closing_hats/images/Morphology_2_Tutorial_Theory_TopHat.png index ca22789b03..18f5084ca5 100644 Binary files a/doc/tutorials/imgproc/opening_closing_hats/images/Morphology_2_Tutorial_Theory_TopHat.png and 
b/doc/tutorials/imgproc/opening_closing_hats/images/Morphology_2_Tutorial_Theory_TopHat.png differ diff --git a/doc/tutorials/imgproc/opening_closing_hats/opening_closing_hats.markdown b/doc/tutorials/imgproc/opening_closing_hats/opening_closing_hats.markdown index aaf57b1732..f5042907ef 100644 --- a/doc/tutorials/imgproc/opening_closing_hats/opening_closing_hats.markdown +++ b/doc/tutorials/imgproc/opening_closing_hats/opening_closing_hats.markdown @@ -36,15 +36,10 @@ discuss briefly 5 operations offered by OpenCV: foreground) - For instance, check out the example below. The image at the left is the original and the image at the right is the result after applying the opening transformation. We can observe that the - small spaces in the corners of the letter tend to disappear. + small dots have disappeared. ![](images/Morphology_2_Tutorial_Theory_Opening.png) -For the sake of clarity, we have performed the opening operation (`7x7` rectangular structuring element) -on the same original image but inverted such as the object in white is now the letter. - -![Left image: original image inverted, right image: resulting opening](images/Morphology_2_Tutorial_Theory_Opening_2.png) - ### Closing - It is obtained by the dilation of an image followed by an erosion. @@ -55,10 +50,6 @@ on the same original image but inverted such as the object in white is now the l ![](images/Morphology_2_Tutorial_Theory_Closing.png) -On the inverted image, we have performed the closing operation (`7x7` rectangular structuring element): - -![Left image: original image inverted, right image: resulting closing](images/Morphology_2_Tutorial_Theory_Closing_2.png) - ### Morphological Gradient - It is the difference between the dilation and the erosion of an image. @@ -88,14 +79,28 @@ On the inverted image, we have performed the closing operation (`7x7` rectangula Code ---- -This tutorial code's is shown lines below. You can also download it from +@add_toggle_cpp +This tutorial's code is shown below. 
You can also download it [here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ImgProc/Morphology_2.cpp) @include cpp/tutorial_code/ImgProc/Morphology_2.cpp +@end_toggle + +@add_toggle_java +This tutorial's code is shown below. You can also download it +[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/ImgProc/opening_closing_hats/MorphologyDemo2.java) +@include java/tutorial_code/ImgProc/opening_closing_hats/MorphologyDemo2.java +@end_toggle + +@add_toggle_python +This tutorial's code is shown below. You can also download it +[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/imgProc/opening_closing_hats/morphology_2.py) +@include python/tutorial_code/imgProc/opening_closing_hats/morphology_2.py +@end_toggle Explanation ----------- --# Let's check the general structure of the program: +-# Let's check the general structure of the C++ program: - Load an image - Create a window to display results of the Morphological operations - Create three Trackbars for the user to enter parameters: @@ -139,8 +144,8 @@ Explanation Results ------- -- After compiling the code above we can execute it giving an image path as an argument. For this - tutorial we use as input the image: **baboon.png**: +- After compiling the code above we can execute it giving an image path as an argument. Results using + the image: **baboon.png**: ![](images/Morphology_2_Tutorial_Original_Image.jpg) diff --git a/modules/core/include/opencv2/core/cuda.hpp b/modules/core/include/opencv2/core/cuda.hpp index 0ee9b74273..820aba71ec 100644 --- a/modules/core/include/opencv2/core/cuda.hpp +++ b/modules/core/include/opencv2/core/cuda.hpp @@ -305,6 +305,9 @@ public: //! returns true if GpuMat data is NULL bool empty() const; + //! internal use method: updates the continuity flag + void updateContinuityFlag(); + /*! 
includes several bit-fields: - the magic signature - continuity flag diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp index 4bba3ad0e6..4083b0196b 100644 --- a/modules/core/include/opencv2/core/mat.hpp +++ b/modules/core/include/opencv2/core/mat.hpp @@ -2084,6 +2084,9 @@ public: static MatAllocator* getDefaultAllocator(); static void setDefaultAllocator(MatAllocator* allocator); + //! internal use method: updates the continuity flag + void updateContinuityFlag(); + //! interaction with UMat UMatData* u; @@ -2551,6 +2554,9 @@ public: //! and the standard allocator static MatAllocator* getStdAllocator(); + //! internal use method: updates the continuity flag + void updateContinuityFlag(); + // black-box container of UMat data UMatData* u; diff --git a/modules/core/include/opencv2/core/mat.inl.hpp b/modules/core/include/opencv2/core/mat.inl.hpp index 49db94a951..fad989c0b2 100644 --- a/modules/core/include/opencv2/core/mat.inl.hpp +++ b/modules/core/include/opencv2/core/mat.inl.hpp @@ -495,24 +495,20 @@ Mat::Mat(int _rows, int _cols, int _type, void* _data, size_t _step) if( _step == AUTO_STEP ) { _step = minstep; - flags |= CONTINUOUS_FLAG; } else { CV_DbgAssert( _step >= minstep ); - if (_step % esz1 != 0) { CV_Error(Error::BadStep, "Step must be a multiple of esz1"); } - - if (_step == minstep || rows == 1) - flags |= CONTINUOUS_FLAG; } step[0] = _step; step[1] = esz; datalimit = datastart + _step * rows; dataend = datalimit - _step + minstep; + updateContinuityFlag(); } inline @@ -528,7 +524,6 @@ Mat::Mat(Size _sz, int _type, void* _data, size_t _step) if( _step == AUTO_STEP ) { _step = minstep; - flags |= CONTINUOUS_FLAG; } else { @@ -538,14 +533,12 @@ Mat::Mat(Size _sz, int _type, void* _data, size_t _step) { CV_Error(Error::BadStep, "Step must be a multiple of esz1"); } - - if (_step == minstep || rows == 1) - flags |= CONTINUOUS_FLAG; } step[0] = _step; step[1] = esz; datalimit = datastart + _step*rows; dataend 
= datalimit - _step + minstep; + updateContinuityFlag(); } template inline diff --git a/modules/core/include/opencv2/core/private.cuda.hpp b/modules/core/include/opencv2/core/private.cuda.hpp index 10948e5408..499c19bcb0 100644 --- a/modules/core/include/opencv2/core/private.cuda.hpp +++ b/modules/core/include/opencv2/core/private.cuda.hpp @@ -152,7 +152,7 @@ namespace cv { namespace cuda inline ~NppStreamHandler() { - nppSetStream(oldStream); + cudaStreamSynchronize(oldStream); } private: diff --git a/modules/core/misc/java/test/MatTest.java b/modules/core/misc/java/test/MatTest.java index 7d0731aa71..a497bba1f2 100644 --- a/modules/core/misc/java/test/MatTest.java +++ b/modules/core/misc/java/test/MatTest.java @@ -489,7 +489,7 @@ public class MatTest extends OpenCVTestCase { public void testIsContinuous() { assertTrue(gray0.isContinuous()); - Mat subMat = gray0.submat(0, 0, gray0.rows() / 2, gray0.cols() / 2); + Mat subMat = gray0.submat(0, gray0.rows() / 2, 0, gray0.cols() / 2); assertFalse(subMat.isContinuous()); } @@ -937,7 +937,7 @@ public class MatTest extends OpenCVTestCase { } public void testSubmatRect() { - Mat submat = gray255.submat(new Rect(5, gray255.rows() / 2, 5, gray255.cols() / 2)); + Mat submat = gray255.submat(new Rect(5, 5, gray255.cols() / 2, gray255.rows() / 2)); assertTrue(submat.isSubmatrix()); assertFalse(submat.isContinuous()); diff --git a/modules/core/src/cuda_gpu_mat.cpp b/modules/core/src/cuda_gpu_mat.cpp index 9514ec2037..0624032460 100644 --- a/modules/core/src/cuda_gpu_mat.cpp +++ b/modules/core/src/cuda_gpu_mat.cpp @@ -46,6 +46,13 @@ using namespace cv; using namespace cv::cuda; +void cv::cuda::GpuMat::updateContinuityFlag() +{ + int sz[] = { rows, cols }; + size_t steps[] = { step, elemSize() }; + flags = cv::updateContinuityFlag(flags, 2, sz, steps); +} + cv::cuda::GpuMat::GpuMat(int rows_, int cols_, int type_, void* data_, size_t step_) : flags(Mat::MAGIC_VAL + (type_ & Mat::TYPE_MASK)), rows(rows_), cols(cols_), step(step_), 
data((uchar*)data_), refcount(0), @@ -57,7 +64,6 @@ cv::cuda::GpuMat::GpuMat(int rows_, int cols_, int type_, void* data_, size_t st if (step == Mat::AUTO_STEP) { step = minstep; - flags |= Mat::CONTINUOUS_FLAG; } else { @@ -65,11 +71,10 @@ cv::cuda::GpuMat::GpuMat(int rows_, int cols_, int type_, void* data_, size_t st step = minstep; CV_DbgAssert( step >= minstep ); - - flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0; } dataend += step * (rows - 1) + minstep; + updateContinuityFlag(); } cv::cuda::GpuMat::GpuMat(Size size_, int type_, void* data_, size_t step_) : @@ -83,7 +88,6 @@ cv::cuda::GpuMat::GpuMat(Size size_, int type_, void* data_, size_t step_) : if (step == Mat::AUTO_STEP) { step = minstep; - flags |= Mat::CONTINUOUS_FLAG; } else { @@ -91,11 +95,10 @@ cv::cuda::GpuMat::GpuMat(Size size_, int type_, void* data_, size_t step_) : step = minstep; CV_DbgAssert( step >= minstep ); - - flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0; } dataend += step * (rows - 1) + minstep; + updateContinuityFlag(); } cv::cuda::GpuMat::GpuMat(const GpuMat& m, Range rowRange_, Range colRange_) @@ -127,17 +130,15 @@ cv::cuda::GpuMat::GpuMat(const GpuMat& m, Range rowRange_, Range colRange_) cols = colRange_.size(); data += colRange_.start*elemSize(); - flags &= cols < m.cols ? ~Mat::CONTINUOUS_FLAG : -1; } - if (rows == 1) - flags |= Mat::CONTINUOUS_FLAG; - if (refcount) CV_XADD(refcount, 1); if (rows <= 0 || cols <= 0) rows = cols = 0; + + updateContinuityFlag(); } cv::cuda::GpuMat::GpuMat(const GpuMat& m, Rect roi) : @@ -146,16 +147,19 @@ cv::cuda::GpuMat::GpuMat(const GpuMat& m, Rect roi) : datastart(m.datastart), dataend(m.dataend), allocator(m.allocator) { - flags &= roi.width < m.cols ? 
~Mat::CONTINUOUS_FLAG : -1; data += roi.x * elemSize(); - CV_Assert( 0 <= roi.x && 0 <= roi.width && roi.x + roi.width <= m.cols && 0 <= roi.y && 0 <= roi.height && roi.y + roi.height <= m.rows ); + CV_Assert( 0 <= roi.x && 0 <= roi.width && + roi.x + roi.width <= m.cols && + 0 <= roi.y && 0 <= roi.height && + roi.y + roi.height <= m.rows ); if (refcount) CV_XADD(refcount, 1); if (rows <= 0 || cols <= 0) rows = cols = 0; + updateContinuityFlag(); } GpuMat cv::cuda::GpuMat::reshape(int new_cn, int new_rows) const @@ -245,11 +249,7 @@ GpuMat& cv::cuda::GpuMat::adjustROI(int dtop, int dbottom, int dleft, int dright rows = row2 - row1; cols = col2 - col1; - if (esz * cols == step || rows == 1) - flags |= Mat::CONTINUOUS_FLAG; - else - flags &= ~Mat::CONTINUOUS_FLAG; - + updateContinuityFlag(); return *this; } diff --git a/modules/core/src/cuda_host_mem.cpp b/modules/core/src/cuda_host_mem.cpp index 37b2314c96..af2fc05fe0 100644 --- a/modules/core/src/cuda_host_mem.cpp +++ b/modules/core/src/cuda_host_mem.cpp @@ -201,10 +201,13 @@ void cv::cuda::HostMem::create(int rows_, int cols_, int type_) if (rows_ > 0 && cols_ > 0) { - flags = Mat::MAGIC_VAL + Mat::CONTINUOUS_FLAG + type_; + flags = Mat::MAGIC_VAL + type_; rows = rows_; cols = cols_; step = elemSize() * cols; + int sz[] = { rows, cols }; + size_t steps[] = { step, CV_ELEM_SIZE(type_) }; + flags = updateContinuityFlag(flags, 2, sz, steps); if (alloc_type == SHARED) { diff --git a/modules/core/src/cuda_stream.cpp b/modules/core/src/cuda_stream.cpp index 67cb6ad954..d06ae67af9 100644 --- a/modules/core/src/cuda_stream.cpp +++ b/modules/core/src/cuda_stream.cpp @@ -594,10 +594,11 @@ namespace StackAllocator::~StackAllocator() { - cudaStreamSynchronize(stream_); - if (memStack_ != 0) + { + cudaStreamSynchronize(stream_); memStack_->pool->returnMemStack(memStack_); + } } size_t alignUp(size_t what, size_t alignment) diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp index d9d7ebef97..a003fd4b01 
100644 --- a/modules/core/src/matrix.cpp +++ b/modules/core/src/matrix.cpp @@ -262,31 +262,36 @@ void setSize( Mat& m, int _dims, const int* _sz, const size_t* _steps, bool auto } } -static void updateContinuityFlag(Mat& m) +int updateContinuityFlag(int flags, int dims, const int* size, const size_t* step) { int i, j; - for( i = 0; i < m.dims; i++ ) + for( i = 0; i < dims; i++ ) { - if( m.size[i] > 1 ) + if( size[i] > 1 ) break; } - for( j = m.dims-1; j > i; j-- ) + uint64 t = (uint64)size[std::min(i, dims-1)]*CV_MAT_CN(flags); + for( j = dims-1; j > i; j-- ) { - if( m.step[j]*m.size[j] < m.step[j-1] ) + t *= size[j]; + if( step[j]*size[j] < step[j-1] ) break; } - uint64 t = (uint64)m.step[0]*m.size[0]; - if( j <= i && t == (size_t)t ) - m.flags |= Mat::CONTINUOUS_FLAG; - else - m.flags &= ~Mat::CONTINUOUS_FLAG; + if( j <= i && t == (uint64)(int)t ) + return flags | Mat::CONTINUOUS_FLAG; + return flags & ~Mat::CONTINUOUS_FLAG; +} + +void Mat::updateContinuityFlag() +{ + flags = cv::updateContinuityFlag(flags, dims, size.p, step.p); } void finalizeHdr(Mat& m) { - updateContinuityFlag(m); + m.updateContinuityFlag(); int d = m.dims; if( d > 2 ) m.rows = m.cols = -1; @@ -427,7 +432,6 @@ Mat::Mat(const Mat& m, const Range& _rowRange, const Range& _colRange) && _colRange.end <= m.cols ); cols = _colRange.size(); data += _colRange.start*elemSize(); - flags &= cols < m.cols ? ~CONTINUOUS_FLAG : -1; flags |= SUBMATRIX_FLAG; } } @@ -437,8 +441,7 @@ Mat::Mat(const Mat& m, const Range& _rowRange, const Range& _colRange) CV_RETHROW(); } - if( rows == 1 ) - flags |= CONTINUOUS_FLAG; + updateContinuityFlag(); if( rows <= 0 || cols <= 0 ) { @@ -455,8 +458,6 @@ Mat::Mat(const Mat& m, const Rect& roi) allocator(m.allocator), u(m.u), size(&rows) { CV_Assert( m.dims <= 2 ); - flags &= roi.width < m.cols ? ~CONTINUOUS_FLAG : -1; - flags |= roi.height == 1 ? 
CONTINUOUS_FLAG : 0; size_t esz = CV_ELEM_SIZE(flags); data += roi.x*esz; @@ -468,6 +469,7 @@ Mat::Mat(const Mat& m, const Rect& roi) flags |= SUBMATRIX_FLAG; step[0] = m.step[0]; step[1] = esz; + updateContinuityFlag(); if( rows <= 0 || cols <= 0 ) { @@ -522,7 +524,7 @@ Mat::Mat(const Mat& m, const Range* ranges) flags |= SUBMATRIX_FLAG; } } - updateContinuityFlag(*this); + updateContinuityFlag(); } Mat::Mat(const Mat& m, const std::vector& ranges) @@ -548,7 +550,7 @@ Mat::Mat(const Mat& m, const std::vector& ranges) flags |= SUBMATRIX_FLAG; } } - updateContinuityFlag(*this); + updateContinuityFlag(); } @@ -575,10 +577,7 @@ Mat Mat::diag(int d) const m.size[1] = m.cols = 1; m.step[0] += (len > 1 ? esz : 0); - if( m.rows > 1 ) - m.flags &= ~CONTINUOUS_FLAG; - else - m.flags |= CONTINUOUS_FLAG; + m.updateContinuityFlag(); if( size() != Size(1,1) ) m.flags |= SUBMATRIX_FLAG; @@ -597,13 +596,6 @@ void Mat::pop_back(size_t nelems) { size.p[0] -= (int)nelems; dataend -= nelems*step.p[0]; - /*if( size.p[0] <= 1 ) - { - if( dims <= 2 ) - flags |= CONTINUOUS_FLAG; - else - updateContinuityFlag(*this); - }*/ } } @@ -618,7 +610,10 @@ void Mat::push_back_(const void* elem) memcpy(data + r*step.p[0], elem, esz); size.p[0] = r + 1; dataend += step.p[0]; - if( esz < step.p[0] ) + uint64 tsz = size.p[0]; + for( int i = 1; i < dims; i++ ) + tsz *= size.p[i]; + if( esz < step.p[0] || tsz != (uint64)(int)tsz ) flags &= ~CONTINUOUS_FLAG; } @@ -792,10 +787,7 @@ Mat& Mat::adjustROI( int dtop, int dbottom, int dleft, int dright ) data += (row1 - ofs.y)*step + (col1 - ofs.x)*esz; rows = row2 - row1; cols = col2 - col1; size.p[0] = rows; size.p[1] = cols; - if( esz*cols == step[0] || rows == 1 ) - flags |= CONTINUOUS_FLAG; - else - flags &= ~CONTINUOUS_FLAG; + updateContinuityFlag(); return *this; } diff --git a/modules/core/src/matrix_c.cpp b/modules/core/src/matrix_c.cpp index 28583cd05c..6d299f7531 100644 --- a/modules/core/src/matrix_c.cpp +++ b/modules/core/src/matrix_c.cpp @@ -120,8 
+120,8 @@ static Mat iplImageToMat(const IplImage* img, bool copyData) } m.datalimit = m.datastart + m.step.p[0]*m.rows; m.dataend = m.datastart + m.step.p[0]*(m.rows-1) + esz*m.cols; - m.flags |= (m.cols*esz == m.step.p[0] || m.rows == 1 ? Mat::CONTINUOUS_FLAG : 0); m.step[1] = esz; + m.updateContinuityFlag(); if( copyData ) { diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp index 142f90aeec..59b649feaa 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -5681,8 +5681,6 @@ namespace cv { // three funcs below are implemented in umatrix.cpp void setSize( UMat& m, int _dims, const int* _sz, const size_t* _steps, bool autoSteps = false ); - -void updateContinuityFlag(UMat& m); void finalizeHdr(UMat& m); } // namespace cv diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp index 1e261785b2..0786743b43 100644 --- a/modules/core/src/precomp.hpp +++ b/modules/core/src/precomp.hpp @@ -193,6 +193,7 @@ inline Size getContinuousSize( const Mat& m1, const Mat& m2, void setSize( Mat& m, int _dims, const int* _sz, const size_t* _steps, bool autoSteps=false ); void finalizeHdr(Mat& m); +int updateContinuityFlag(int flags, int dims, const int* size, const size_t* step); struct NoVec { diff --git a/modules/core/src/umatrix.cpp b/modules/core/src/umatrix.cpp index 303b6bec12..f89f777b52 100644 --- a/modules/core/src/umatrix.cpp +++ b/modules/core/src/umatrix.cpp @@ -318,32 +318,15 @@ void setSize( UMat& m, int _dims, const int* _sz, } -void updateContinuityFlag(UMat& m) +void UMat::updateContinuityFlag() { - int i, j; - for( i = 0; i < m.dims; i++ ) - { - if( m.size[i] > 1 ) - break; - } - - for( j = m.dims-1; j > i; j-- ) - { - if( m.step[j]*m.size[j] < m.step[j-1] ) - break; - } - - uint64 total = (uint64)m.step[0]*m.size[0]; - if( j <= i && total == (size_t)total ) - m.flags |= UMat::CONTINUOUS_FLAG; - else - m.flags &= ~UMat::CONTINUOUS_FLAG; + flags = cv::updateContinuityFlag(flags, dims, size.p, step.p); } void 
finalizeHdr(UMat& m) { - updateContinuityFlag(m); + m.updateContinuityFlag(); int d = m.dims; if( d > 2 ) m.rows = m.cols = -1; @@ -537,12 +520,10 @@ UMat::UMat(const UMat& m, const Range& _rowRange, const Range& _colRange) CV_Assert( 0 <= _colRange.start && _colRange.start <= _colRange.end && _colRange.end <= m.cols ); cols = _colRange.size(); offset += _colRange.start*elemSize(); - flags &= cols < m.cols ? ~CONTINUOUS_FLAG : -1; flags |= SUBMATRIX_FLAG; } - if( rows == 1 ) - flags |= CONTINUOUS_FLAG; + updateContinuityFlag(); if( rows <= 0 || cols <= 0 ) { @@ -557,8 +538,6 @@ UMat::UMat(const UMat& m, const Rect& roi) allocator(m.allocator), usageFlags(m.usageFlags), u(m.u), offset(m.offset + roi.y*m.step[0]), size(&rows) { CV_Assert( m.dims <= 2 ); - flags &= roi.width < m.cols ? ~CONTINUOUS_FLAG : -1; - flags |= roi.height == 1 ? CONTINUOUS_FLAG : 0; size_t esz = CV_ELEM_SIZE(flags); offset += roi.x*esz; @@ -570,6 +549,7 @@ UMat::UMat(const UMat& m, const Rect& roi) flags |= SUBMATRIX_FLAG; step[0] = m.step[0]; step[1] = esz; + updateContinuityFlag(); if( rows <= 0 || cols <= 0 ) { @@ -601,7 +581,7 @@ UMat::UMat(const UMat& m, const Range* ranges) flags |= SUBMATRIX_FLAG; } } - updateContinuityFlag(*this); + updateContinuityFlag(); } UMat::UMat(const UMat& m, const std::vector& ranges) @@ -626,7 +606,7 @@ UMat::UMat(const UMat& m, const std::vector& ranges) flags |= SUBMATRIX_FLAG; } } - updateContinuityFlag(*this); + updateContinuityFlag(); } UMat UMat::diag(int d) const @@ -652,10 +632,7 @@ UMat UMat::diag(int d) const m.size[1] = m.cols = 1; m.step[0] += (len > 1 ? 
esz : 0); - if( m.rows > 1 ) - m.flags &= ~CONTINUOUS_FLAG; - else - m.flags |= CONTINUOUS_FLAG; + m.updateContinuityFlag(); if( size() != Size(1,1) ) m.flags |= SUBMATRIX_FLAG; @@ -701,10 +678,7 @@ UMat& UMat::adjustROI( int dtop, int dbottom, int dleft, int dright ) offset += (row1 - ofs.y)*step + (col1 - ofs.x)*esz; rows = row2 - row1; cols = col2 - col1; size.p[0] = rows; size.p[1] = cols; - if( esz*cols == step[0] || rows == 1 ) - flags |= CONTINUOUS_FLAG; - else - flags &= ~CONTINUOUS_FLAG; + updateContinuityFlag(); return *this; } diff --git a/modules/core/test/test_io.cpp b/modules/core/test/test_io.cpp index 6b39c41b3b..b4e3d10db6 100644 --- a/modules/core/test/test_io.cpp +++ b/modules/core/test/test_io.cpp @@ -522,33 +522,23 @@ protected: TEST(Core_InputOutput, misc) { CV_MiscIOTest test; test.safe_run(); } -/*class CV_BigMatrixIOTest : public cvtest::BaseTest +#if 0 // 4+ GB of data, 40+ GB of estimated result size, it is very slow +BIGDATA_TEST(Core_InputOutput, huge) { -public: - CV_BigMatrixIOTest() {} - ~CV_BigMatrixIOTest() {} -protected: - void run(int) + RNG& rng = theRNG(); + int N = 1000, M = 1200000; + std::cout << "Allocating..." << std::endl; + Mat mat(M, N, CV_32F); + std::cout << "Initializing..." << std::endl; + rng.fill(mat, RNG::UNIFORM, 0, 1); + std::cout << "Writing..." << std::endl; { - try - { - RNG& rng = theRNG(); - int N = 1000, M = 1200000; - Mat mat(M, N, CV_32F); - rng.fill(mat, RNG::UNIFORM, 0, 1); - FileStorage fs(cv::tempfile(".xml"), FileStorage::WRITE); - fs << "mat" << mat; - fs.release(); - } - catch(...) 
- { - ts->set_failed_test_info(cvtest::TS::FAIL_MISMATCH); - } + FileStorage fs(cv::tempfile(".xml"), FileStorage::WRITE); + fs << "mat" << mat; + fs.release(); } -}; - -TEST(Core_InputOutput, huge) { CV_BigMatrixIOTest test; test.safe_run(); } -*/ +} +#endif TEST(Core_globbing, accuracy) { diff --git a/modules/core/test/test_mat.cpp b/modules/core/test/test_mat.cpp index 17bcd5524e..b43ab20016 100644 --- a/modules/core/test/test_mat.cpp +++ b/modules/core/test/test_mat.cpp @@ -1766,4 +1766,26 @@ TEST(Mat_, template_based_ptr) ASSERT_FLOAT_EQ(66.0f, *(mat.ptr(idx))); } + +BIGDATA_TEST(Mat, push_back_regression_4158) // memory usage: ~10.6 Gb +{ + Mat result; + + Mat tail(100, 500000, CV_32FC2, Scalar(1, 2)); + + tail.copyTo(result); + for (int i = 1; i < 15; i++) + { + result.push_back(tail); + std::cout << "i = " << i << " result = " << result.size() << " used = " << (uint64)result.total()*result.elemSize()*(1.0 / (1 << 20)) << " Mb" + << " allocated=" << (uint64)(result.datalimit - result.datastart)*(1.0 / (1 << 20)) << " Mb" << std::endl; + } + for (int i = 0; i < 15; i++) + { + Rect roi(0, tail.rows * i, tail.cols, tail.rows); + int nz = countNonZero(result(roi).reshape(1) == 2); + EXPECT_EQ(tail.total(), (size_t)nz) << "i=" << i; + } +} + }} // namespace diff --git a/modules/cudaarithm/src/reductions.cpp b/modules/cudaarithm/src/reductions.cpp index ce1bc232cf..4824a5c4da 100644 --- a/modules/cudaarithm/src/reductions.cpp +++ b/modules/cudaarithm/src/reductions.cpp @@ -137,12 +137,11 @@ void cv::cuda::meanStdDev(InputArray _src, OutputArray _dst, Stream& stream) if (!deviceSupports(FEATURE_SET_COMPUTE_13)) CV_Error(cv::Error::StsNotImplemented, "Not sufficient compute capebility"); - GpuMat src = getInputMat(_src, stream); + const GpuMat src = getInputMat(_src, stream); CV_Assert( src.type() == CV_8UC1 ); - _dst.create(1, 2, CV_64FC1); - GpuMat dst = _dst.getGpuMat(); + GpuMat dst = getOutputMat(_dst, 1, 2, CV_64FC1, stream); NppiSize sz; sz.width = src.cols; 
diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp index 7f8c7e7499..6ac2f1a7fe 100644 --- a/modules/dnn/include/opencv2/dnn/dnn.hpp +++ b/modules/dnn/include/opencv2/dnn/dnn.hpp @@ -826,6 +826,10 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN CV_OUT std::vector& indices, const float eta = 1.f, const int top_k = 0); + CV_EXPORTS void NMSBoxes(const std::vector& bboxes, const std::vector& scores, + const float score_threshold, const float nms_threshold, + CV_OUT std::vector& indices, + const float eta = 1.f, const int top_k = 0); //! @} CV__DNN_EXPERIMENTAL_NS_END diff --git a/modules/dnn/perf/perf_net.cpp b/modules/dnn/perf/perf_net.cpp index 9d5acf04e3..c05a7088cd 100644 --- a/modules/dnn/perf/perf_net.cpp +++ b/modules/dnn/perf/perf_net.cpp @@ -121,7 +121,9 @@ PERF_TEST_P_(DNNTestNetwork, Inception_5h) PERF_TEST_P_(DNNTestNetwork, ENet) { - if (backend == DNN_BACKEND_INFERENCE_ENGINE) throw SkipTestException(""); + if ((backend == DNN_BACKEND_INFERENCE_ENGINE) || + (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16)) + throw SkipTestException(""); processNet("dnn/Enet-model-best.net", "", "enet.yml", Mat(cv::Size(512, 256), CV_32FC3)); } @@ -232,7 +234,8 @@ const tuple testCases[] = { tuple(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16), #endif tuple(DNN_BACKEND_DEFAULT, DNN_TARGET_CPU), - tuple(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL) + tuple(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL), + tuple(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL_FP16) }; INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, testing::ValuesIn(testCases)); diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index 3f753ecec6..78952f62fb 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -62,6 +62,10 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN // this option is useful to run valgrind memory errors detection static bool DNN_DISABLE_MEMORY_OPTIMIZATIONS = 
utils::getConfigurationParameterBool("OPENCV_DNN_DISABLE_MEMORY_OPTIMIZATIONS", false); +#ifdef HAVE_OPENCL +static bool DNN_OPENCL_ALLOW_ALL_DEVICES = utils::getConfigurationParameterBool("OPENCV_DNN_OPENCL_ALLOW_ALL_DEVICES", false); +#endif + using std::vector; using std::map; using std::make_pair; @@ -497,7 +501,7 @@ public: } } - void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, bool forceCreate) + void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, bool forceCreate, bool use_half) { if (!DNN_DISABLE_MEMORY_OPTIMIZATIONS && !forceCreate) { @@ -538,14 +542,14 @@ public: { // if dst already has been allocated with total(shape) elements, // it won't be recrreated and pointer of dst.data remains the same. - dst.create(shape, CV_32F); + dst.create(shape, use_half ? CV_16S : CV_32F); addHost(lp, dst); } } void allocateBlobsForLayer(LayerData &ld, const LayerShapes& layerShapes, std::vector& pinsForInternalBlobs, - bool forceCreate = false) + bool forceCreate = false, bool use_half = false) { CV_TRACE_FUNCTION(); @@ -616,7 +620,7 @@ public: reuse(ld.inputBlobsId[0], blobPin); } else - reuseOrCreate(shapes[index], blobPin, *blobs[index], forceCreate); + reuseOrCreate(shapes[index], blobPin, *blobs[index], forceCreate, use_half); } } } @@ -654,7 +658,7 @@ static Ptr wrapMat(int backendId, int targetId, cv::Mat& m) { if (targetId == DNN_TARGET_CPU) return Ptr(); - else if (targetId == DNN_TARGET_OPENCL) + else if (IS_DNN_OPENCL_TARGET(targetId)) return OpenCLBackendWrapper::create(m); else CV_Error(Error::StsNotImplemented, "Unknown target identifier"); @@ -719,6 +723,7 @@ struct Net::Impl bool netWasAllocated; bool fusion; std::vector layersTimings; + Mat output_blob; Ptr wrap(Mat& host) { @@ -735,7 +740,7 @@ struct Net::Impl Ptr baseBuffer = backendWrappers[data]; if (preferableBackend == DNN_BACKEND_DEFAULT) { - CV_Assert(preferableTarget == DNN_TARGET_OPENCL); + CV_Assert(IS_DNN_OPENCL_TARGET(preferableTarget)); return 
OpenCLBackendWrapper::create(baseBuffer, host); } else if (preferableBackend == DNN_BACKEND_HALIDE) @@ -847,12 +852,22 @@ struct Net::Impl if (!netWasAllocated || this->blobsToKeep != blobsToKeep_) { + if (preferableBackend == DNN_BACKEND_DEFAULT && IS_DNN_OPENCL_TARGET(preferableTarget)) #ifndef HAVE_OPENCL - if (preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_OPENCL) { - CV_LOG_WARNING(NULL, "DNN: OpenCL target is not available in this OpenCV build, switching to CPU.") + CV_LOG_WARNING(NULL, "DNN: OpenCL target is not available in this OpenCV build, switching to CPU."); preferableTarget = DNN_TARGET_CPU; } +#else + { + if (!DNN_OPENCL_ALLOW_ALL_DEVICES + && !(ocl::Device::getDefault().isIntel() && ocl::Device::getDefault().type() == ocl::Device::TYPE_GPU) // Current implementation is only valid for Intel GPU (#11494) + ) + { + CV_LOG_WARNING(NULL, "DNN: OpenCL target is not supported with current OpenCL device (tested with Intel GPUs only), switching to CPU."); + preferableTarget = DNN_TARGET_CPU; + } + } #endif clear(); @@ -1022,7 +1037,7 @@ struct Net::Impl { CV_TRACE_FUNCTION(); if (preferableBackend == DNN_BACKEND_DEFAULT) - CV_Assert(preferableTarget == DNN_TARGET_CPU || preferableTarget == DNN_TARGET_OPENCL); + CV_Assert(preferableTarget == DNN_TARGET_CPU || IS_DNN_OPENCL_TARGET(preferableTarget)); else if (preferableBackend == DNN_BACKEND_HALIDE) initHalideBackend(); else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE) @@ -1357,7 +1372,9 @@ struct Net::Impl std::vector pinsForInternalBlobs; blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs, - preferableBackend == DNN_BACKEND_INFERENCE_ENGINE); + preferableBackend == DNN_BACKEND_INFERENCE_ENGINE, + preferableBackend == DNN_BACKEND_DEFAULT && + preferableTarget == DNN_TARGET_OPENCL_FP16); ld.outputBlobsWrappers.resize(ld.outputBlobs.size()); for (int i = 0; i < ld.outputBlobs.size(); ++i) { @@ -1427,7 +1444,7 @@ struct Net::Impl // some other 
layers. // TODO: OpenCL target support more fusion styles. - if ( preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_OPENCL && + if ( preferableBackend == DNN_BACKEND_DEFAULT && IS_DNN_OPENCL_TARGET(preferableTarget) && (!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" && ld.layerInstance->type != "MVN")) ) continue; @@ -1466,8 +1483,8 @@ struct Net::Impl continue; // Go to the next layer. // For now, OpenCL target support fusion with activation of ReLU/ChannelsPReLU/Power/Tanh - if ( preferableTarget != DNN_TARGET_OPENCL || - (preferableTarget == DNN_TARGET_OPENCL && + if ( !IS_DNN_OPENCL_TARGET(preferableTarget) || + (IS_DNN_OPENCL_TARGET(preferableTarget) && nextData && ((nextData->type == "ReLU") || (nextData->type == "ChannelsPReLU") || @@ -1490,7 +1507,7 @@ struct Net::Impl ld.outputBlobs = layers[lpNext.lid].outputBlobs; ld.outputBlobsWrappers = layers[lpNext.lid].outputBlobsWrappers; - if ( preferableTarget == DNN_TARGET_OPENCL ) + if ( IS_DNN_OPENCL_TARGET(preferableTarget) ) { if ( !activData->consumers.empty() ) { @@ -1502,7 +1519,7 @@ struct Net::Impl } // fuse convlution layer followed by eltwise + relu - if ( preferableTarget == DNN_TARGET_OPENCL ) + if ( IS_DNN_OPENCL_TARGET(preferableTarget) ) { Ptr nextEltwiseLayer; if( nextData ) @@ -1715,6 +1732,13 @@ struct Net::Impl for(int i = 0; i < layers[0].outputBlobs.size(); i++) { CV_Assert(layers[0].outputBlobs[i].total()); + if (layers[0].outputBlobs[i].depth() == CV_32F && + preferableBackend == DNN_BACKEND_DEFAULT && + preferableTarget == DNN_TARGET_OPENCL_FP16) + { + Mat mat = layers[0].outputBlobs[i].clone(); + convertFp16(mat, layers[0].outputBlobs[i]); + } inputShapes.push_back(shape(layers[0].outputBlobs[i])); } LayersShapesMap layersShapes; @@ -1760,7 +1784,7 @@ struct Net::Impl { if( !ld.skip ) { - if (preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_OPENCL) + if (preferableBackend == DNN_BACKEND_DEFAULT && 
IS_DNN_OPENCL_TARGET(preferableTarget)) { std::vector umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers); layer->forward(OpenCLBackendWrapper::getUMatVector(ld.inputBlobsWrappers), @@ -1925,7 +1949,14 @@ struct Net::Impl // Transfer data to CPU if it's require. ld.outputBlobsWrappers[pin.oid]->copyToHost(); } - return ld.outputBlobs[pin.oid]; + + if (ld.outputBlobs[pin.oid].depth() == CV_16S) + { + convertFp16(ld.outputBlobs[pin.oid], output_blob); + return output_blob; + } + else + return ld.outputBlobs[pin.oid]; } Mat getBlob(String outputName) @@ -2068,7 +2099,7 @@ void Net::forward(OutputArrayOfArrays outputBlobs, const String& outputName) if (outputBlobs.isUMat()) { - outputBlobs.assign(ld.outputBlobs[pin.oid].getUMat(ACCESS_RW)); + outputBlobs.assign(impl->getBlob(layerName).getUMat(ACCESS_RW)); } else if (outputBlobs.isMat()) { @@ -2084,17 +2115,33 @@ void Net::forward(OutputArrayOfArrays outputBlobs, const String& outputName) ld.outputBlobsWrappers[i]->copyToHost(); } } - std::vector & outputvec = *(std::vector *)outputBlobs.getObj(); - outputvec = ld.outputBlobs; + if (ld.outputBlobs[0].depth() == CV_32F) + { + std::vector & outputvec = *(std::vector *)outputBlobs.getObj(); + outputvec = ld.outputBlobs; + } else { + std::vector & outputvec = *(std::vector *)outputBlobs.getObj(); + outputvec.resize(ld.outputBlobs.size()); + for (int i = 0; i < outputvec.size(); i++) + convertFp16(ld.outputBlobs[i], outputvec[i]); + } } else if (outputBlobs.isUMatVector()) { std::vector & outputvec = *(std::vector *)outputBlobs.getObj(); if (impl->preferableBackend == DNN_BACKEND_DEFAULT && - impl->preferableTarget == DNN_TARGET_OPENCL) + IS_DNN_OPENCL_TARGET(impl->preferableTarget)) { - outputvec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers); + if (impl->preferableTarget == DNN_TARGET_OPENCL) + outputvec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers); + else if (impl->preferableTarget == DNN_TARGET_OPENCL_FP16) + { 
+ std::vector out_vec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers); + outputvec.resize(out_vec.size()); + for (int i = 0; i < out_vec.size(); i++) + convertFp16(out_vec[i], outputvec[i]); + } } else { @@ -2182,6 +2229,16 @@ void Net::setPreferableTarget(int targetId) if( impl->preferableTarget != targetId ) { impl->preferableTarget = targetId; + if (IS_DNN_OPENCL_TARGET(targetId)) + { +#ifndef HAVE_OPENCL + impl->preferableTarget = DNN_TARGET_CPU; +#else + bool fp16 = ocl::Device::getDefault().isExtensionSupported("cl_khr_fp16"); + if (!fp16 && targetId == DNN_TARGET_OPENCL_FP16) + impl->preferableTarget = DNN_TARGET_OPENCL; +#endif + } impl->netWasAllocated = false; impl->clear(); } @@ -2210,7 +2267,17 @@ void Net::setInput(InputArray blob, const String& name) ld.outputBlobs.resize( std::max(pin.oid+1, (int)ld.requiredOutputs.size()) ); ld.outputBlobsWrappers.resize(ld.outputBlobs.size()); MatShape prevShape = shape(ld.outputBlobs[pin.oid]); - Mat blob_ = blob.getMat(); + Mat blob_; + if (impl->preferableBackend == DNN_BACKEND_DEFAULT && + impl->preferableTarget == DNN_TARGET_OPENCL_FP16) + { + Mat blob_mat = blob.getMat(); + convertFp16(blob_mat, blob_); + } + else + { + blob_ = blob.getMat(); + } bool oldShape = prevShape == shape(blob_); if (oldShape) { @@ -2735,6 +2802,43 @@ void Layer::forward_fallback(InputArrayOfArrays inputs_arr, OutputArrayOfArrays CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + if (preferableTarget == DNN_TARGET_OPENCL_FP16 && inputs_arr.depth() == CV_16S) + { + std::vector inputs; + std::vector outputs; + std::vector internals; + + std::vector orig_inputs; + std::vector orig_outputs; + std::vector orig_internals; + + inputs_arr.getUMatVector(orig_inputs); + outputs_arr.getUMatVector(orig_outputs); + internals_arr.getUMatVector(orig_internals); + + inputs.resize(orig_inputs.size()); + for (size_t i = 0; i < orig_inputs.size(); i++) + convertFp16(orig_inputs[i], inputs[i]); + + 
outputs.resize(orig_outputs.size()); + for (size_t i = 0; i < orig_outputs.size(); i++) + outputs[i].create(shape(orig_outputs[i]), CV_32F); + + internals.resize(orig_internals.size()); + for (size_t i = 0; i < orig_internals.size(); i++) + internals[i].create(shape(orig_internals[i]), CV_32F); + + forward(inputs, outputs, internals); + + for (size_t i = 0; i < outputs.size(); i++) + convertFp16(outputs[i], orig_outputs[i]); + + // sync results back + outputs_arr.assign(orig_outputs); + internals_arr.assign(orig_internals); + return; + } + std::vector inpvec; std::vector outputs; std::vector internals; diff --git a/modules/dnn/src/layers/batch_norm_layer.cpp b/modules/dnn/src/layers/batch_norm_layer.cpp index c2906b63f5..2005254cd3 100644 --- a/modules/dnn/src/layers/batch_norm_layer.cpp +++ b/modules/dnn/src/layers/batch_norm_layer.cpp @@ -120,12 +120,16 @@ public: std::vector inputs; std::vector outputs; + bool use_half = (inputs_.depth() == CV_16S); inputs_.getUMatVector(inputs); outputs_.getUMatVector(outputs); CV_Assert(blobs.size() >= 2); CV_Assert(inputs.size() == 1); + if (use_half && inputs[0].dims == 2) + return false; + if (umat_weight.empty()) { umat_weight = weights_.getUMat(ACCESS_READ); @@ -139,6 +143,7 @@ public: int rows = inpBlob.dims > 2 ? inpBlob.size[2] : 1; int cols = inpBlob.dims > 2 ? inpBlob.size[3] : 1; + String opts = (use_half) ? " -DDtype=half" : " -DDtype=float"; for (size_t ii = 0; ii < outputs.size(); ii++) { if (inpBlob.dims == 2) @@ -154,8 +159,12 @@ public: UMat src = inputs[ii].reshape(1, s.size(), &s[0]); UMat dst = outputs[ii].reshape(1, s.size(), &s[0]); int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1); - String buildopt = format("-DNUM=%d", number); + String buildopt = format("-DNUM=%d", number) + opts; String kname = format("batch_norm%d", number); + if (number == 1) + buildopt += format(" -Dconvert_T=convert_%s", use_half ? "half" : "float"); + else + buildopt += format(" -Dconvert_T=convert_%s%d", use_half ? 
"half" : "float", number); ocl::Kernel kernel(kname.c_str(), ocl::dnn::batchnorm_oclsrc, buildopt); if (kernel.empty()) return false; @@ -181,7 +190,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/layers/blank_layer.cpp b/modules/dnn/src/layers/blank_layer.cpp index 0794eff9af..847b6228df 100644 --- a/modules/dnn/src/layers/blank_layer.cpp +++ b/modules/dnn/src/layers/blank_layer.cpp @@ -95,7 +95,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/layers/concat_layer.cpp b/modules/dnn/src/layers/concat_layer.cpp index 172d0a076c..a72b28215b 100644 --- a/modules/dnn/src/layers/concat_layer.cpp +++ b/modules/dnn/src/layers/concat_layer.cpp @@ -128,14 +128,14 @@ public: for( i = 0; i < ninputs; i++ ) { Mat& inp = *inputs[i]; - CV_Assert( inp.isContinuous() && inp.type() == CV_32F && + CV_Assert( inp.isContinuous() && (inp.type() == CV_32F || inp.type() == CV_16S) && inp.dims == 4 && inp.size[0] == output.size[0] && inp.size[2] == output.size[2] && inp.size[3] == output.size[3] ); nchannels += inp.size[1]; } CV_Assert( nchannels == output.size[1] ); - CV_Assert( output.isContinuous() && output.type() == CV_32F ); + CV_Assert( output.isContinuous() && (output.type() == CV_32F || output.type() == CV_16S) ); cc.chptrs.resize(nchannels*batchsz); @@ -186,6 +186,7 @@ public: std::vector inputs; std::vector outputs; + bool use_half = (inps.depth() == CV_16S); inps.getUMatVector(inputs); outs.getUMatVector(outputs); 
@@ -199,11 +200,12 @@ public: int num_concats = total(shape(inputs[0]), 0, cAxis); int offset_concat_axis = 0; UMat& outMat = outputs[0]; - String buildopt = String("-DDtype=") + ocl::typeToStr(inputs[0].type()) + String(" "); + String buildopt = format(" -DDtype=%s", (use_half) ? "half" : "float"); + String kname = format("concat_%s", use_half ? "half" : "float"); for (size_t i = 0; i < inputs.size(); i++) { - ocl::Kernel kernel("concat", ocl::dnn::concat_oclsrc, buildopt); + ocl::Kernel kernel(kname.c_str(), ocl::dnn::concat_oclsrc, buildopt); if (kernel.empty()) return false; @@ -235,7 +237,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 2bb96f92b5..96a9d5b0e3 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -94,7 +94,7 @@ public: CV_Assert(blobs[0].dims == 4 && blobs[0].size[3] == kernel.width && blobs[0].size[2] == kernel.height); const Mat &input = *inputs[0]; - CV_Assert(input.dims == 4 && (input.type() == CV_32F || input.type() == CV_64F)); + CV_Assert(input.dims == 4 && (input.type() == CV_32F || input.type() == CV_64F || input.type() == CV_16S)); for (size_t i = 0; i < inputs.size(); i++) { CV_Assert(inputs[i]->type() == input.type()); @@ -288,7 +288,7 @@ public: newActiv = true; activType = OCL4DNN_CONV_FUSED_ACTIV_NONE; - if (preferableTarget == DNN_TARGET_OPENCL) + if (IS_DNN_OPENCL_TARGET(preferableTarget)) { Ptr activ_power = activ.dynamicCast(); if (!activ_power.empty()) @@ -842,6 +842,7 @@ public: std::vector inputs; std::vector outputs; + bool use_half = (inps.depth() == CV_16S); inps.getUMatVector(inputs); 
outs.getUMatVector(outputs); @@ -860,6 +861,7 @@ public: config.dilation = dilation; config.group = inputs[0].size[1] / umat_blobs[0].size[1]; config.bias_term = (hasBias()) ? true : false; + config.use_half = use_half; convolutionOp = Ptr >(new OCL4DNNConvSpatial(config)); } @@ -964,7 +966,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) @@ -1360,6 +1362,9 @@ public: std::vector outputs; std::vector internals; + if (inputs_.depth() == CV_16S) + return false; + inputs_.getUMatVector(inputs); outputs_.getUMatVector(outputs); internals_.getUMatVector(internals); @@ -1450,7 +1455,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/layers/detection_output_layer.cpp b/modules/dnn/src/layers/detection_output_layer.cpp index 5c4be180a1..44f7b32853 100644 --- a/modules/dnn/src/layers/detection_output_layer.cpp +++ b/modules/dnn/src/layers/detection_output_layer.cpp @@ -307,8 +307,24 @@ public: std::vector inputs; std::vector outputs; - inps.getUMatVector(inputs); - outs.getUMatVector(outputs); + bool use_half = (inps.depth() == CV_16S); + if (use_half) + { + std::vector orig_inputs; + std::vector orig_outputs; + + inps.getUMatVector(orig_inputs); + outs.getUMatVector(orig_outputs); + + inputs.resize(orig_inputs.size()); + for (size_t i = 0; i < orig_inputs.size(); i++) + convertFp16(orig_inputs[i], inputs[i]); + } + else + { + inps.getUMatVector(inputs); + outs.getUMatVector(outputs); + } std::vector allDecodedBBoxes; std::vector 
allConfidenceScores; @@ -342,7 +358,13 @@ public: { // Set confidences to zeros. Range ranges[] = {Range::all(), Range::all(), Range::all(), Range(2, 3)}; - outputs[0](ranges).setTo(0); + if (use_half) + { + std::vector orig_outputs; + outs.getUMatVector(orig_outputs); + orig_outputs[0](ranges).setTo(0); + } else + outputs[0](ranges).setTo(0); return true; } int outputShape[] = {1, 1, (int)numKept, 7}; @@ -360,9 +382,23 @@ public: } CV_Assert(count == numKept); } - outputs.clear(); - outputs.push_back(umat); - outs.assign(outputs); + + if (use_half) + { + UMat half_umat; + convertFp16(umat, half_umat); + + std::vector orig_outputs; + outs.getUMatVector(orig_outputs); + orig_outputs.clear(); + orig_outputs.push_back(half_umat); + outs.assign(orig_outputs); + } else { + outputs.clear(); + outputs.push_back(umat); + outs.assign(outputs); + } + return true; } #endif @@ -372,7 +408,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp index 8600967de4..a24b913ba4 100644 --- a/modules/dnn/src/layers/elementwise_layers.cpp +++ b/modules/dnn/src/layers/elementwise_layers.cpp @@ -176,7 +176,7 @@ public: { CV_TRACE_FUNCTION(); - CV_OCL_RUN((this->preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(this->preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), func.applyOCL(inputs_arr, outputs_arr, internals_arr)) @@ -223,7 +223,12 @@ public: #ifdef HAVE_OPENCL static String oclGetTMacro(const UMat &m) { - return String("-DT=") + ocl::typeToStr(m.type()) + String(" "); + String str_name = ocl::typeToStr(m.type()); + + if (str_name == "short") + str_name = "half"; + + return 
format("-DT=%s -Dconvert_T=convert_%s ", str_name.c_str(), str_name.c_str()); } #endif @@ -516,8 +521,28 @@ struct SigmoidFunctor #ifdef HAVE_OPENCL bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals) { - // TODO: implement OCL version - return false; + std::vector inputs; + std::vector outputs; + + inps.getUMatVector(inputs); + outs.getUMatVector(outputs); + String buildopt = oclGetTMacro(inputs[0]); + + for (size_t i = 0; i < inputs.size(); i++) + { + UMat& src = inputs[i]; + UMat& dst = outputs[i]; + + ocl::Kernel kernel("SigmoidForward", ocl::dnn::activations_oclsrc, buildopt); + kernel.set(0, (int)src.total()); + kernel.set(1, ocl::KernelArg::PtrReadOnly(src)); + kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst)); + + size_t gSize = src.total(); + CV_Assert(kernel.run(1, &gSize, NULL, false)); + } + + return true; } #endif @@ -561,8 +586,28 @@ struct ELUFunctor #ifdef HAVE_OPENCL bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals) { - // TODO: implement OCL version - return false; + std::vector inputs; + std::vector outputs; + + inps.getUMatVector(inputs); + outs.getUMatVector(outputs); + String buildopt = oclGetTMacro(inputs[0]); + + for (size_t i = 0; i < inputs.size(); i++) + { + UMat& src = inputs[i]; + UMat& dst = outputs[i]; + + ocl::Kernel kernel("ELUForward", ocl::dnn::activations_oclsrc, buildopt); + kernel.set(0, (int)src.total()); + kernel.set(1, ocl::KernelArg::PtrReadOnly(src)); + kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst)); + + size_t gSize = src.total(); + CV_Assert(kernel.run(1, &gSize, NULL, false)); + } + + return true; } #endif @@ -604,8 +649,28 @@ struct AbsValFunctor #ifdef HAVE_OPENCL bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals) { - // TODO: implement OCL version - return false; + std::vector inputs; + std::vector outputs; + + inps.getUMatVector(inputs); + outs.getUMatVector(outputs); + String 
buildopt = oclGetTMacro(inputs[0]); + + for (size_t i = 0; i < inputs.size(); i++) + { + UMat& src = inputs[i]; + UMat& dst = outputs[i]; + + ocl::Kernel kernel("AbsValForward", ocl::dnn::activations_oclsrc, buildopt); + kernel.set(0, (int)src.total()); + kernel.set(1, ocl::KernelArg::PtrReadOnly(src)); + kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst)); + + size_t gSize = src.total(); + CV_Assert(kernel.run(1, &gSize, NULL, false)); + } + + return true; } #endif diff --git a/modules/dnn/src/layers/eltwise_layer.cpp b/modules/dnn/src/layers/eltwise_layer.cpp index 58a651e628..39961abb5f 100644 --- a/modules/dnn/src/layers/eltwise_layer.cpp +++ b/modules/dnn/src/layers/eltwise_layer.cpp @@ -271,6 +271,9 @@ public: std::vector inputs; std::vector outputs; + if (inputs_.depth() == CV_16S && op != SUM) + return false; + inputs_.getUMatVector(inputs); outputs_.getUMatVector(outputs); @@ -284,10 +287,15 @@ public: { size_t localsize[] = { 128 }; size_t globalsize[] = { (size_t)channels / 4 * localsize[0] }; + String opts; + if (inputs_.depth() == CV_16S) + opts = " -DDtype=half -DDtype4=half4 -DDtype8=half8"; + else + opts = " -DDtype=float -DDtype4=float4 -DDtype8=float8"; for (int i = 0; i < (inputs.size() - 1); ++i) { - String buildopt = format("-DLOOP=%d", i); + String buildopt = format("-DLOOP=%d", i) + opts; ocl::Kernel kernel("op_sum4", ocl::dnn::eltwise_oclsrc, buildopt); int idx = 0; UMat inpMat = (i == 0) ? inputs[0] : UMat(); @@ -306,6 +314,9 @@ public: } else { + if (inputs_.depth() == CV_16S) + return false; + float coeff1 = coeffs.empty() ? 1.f : coeffs[0]; float coeff2 = coeffs.empty() ? 
1.f : coeffs[1]; UMat mul0, mul1; @@ -343,7 +354,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/layers/flatten_layer.cpp b/modules/dnn/src/layers/flatten_layer.cpp index 1df1681a46..f737ac242b 100644 --- a/modules/dnn/src/layers/flatten_layer.cpp +++ b/modules/dnn/src/layers/flatten_layer.cpp @@ -140,7 +140,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && outputs_arr.isUMatVector() && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index 9ee7e98023..d459e65615 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -64,6 +64,7 @@ public: #ifdef HAVE_OPENCL Ptr > innerProductOp; std::vector umat_blobs; + std::vector half_blobs; #endif FullyConnectedLayerImpl(const LayerParams& params) @@ -277,6 +278,7 @@ public: std::vector inputs; std::vector outputs; + bool use_half = (inps.depth() == CV_16S); inps.getUMatVector(inputs); outs.getUMatVector(outputs); @@ -293,6 +295,17 @@ public: config.bias_term = bias; config.M = outerSize; config.K = innerSize; + config.use_half = use_half; + + if (use_half) + { + half_blobs.resize(umat_blobs.size()); + for (int i = 0; i < umat_blobs.size(); i++) + { + if (!umat_blobs[i].empty()) + convertFp16(umat_blobs[i], half_blobs[i]); + } + } innerProductOp = Ptr >(new OCL4DNNInnerProduct(config)); } @@ -309,13 +322,15 @@ public: dstMat = outputs[i].reshape(1, outshape.size(), 
&outshape[0]); dstMat.setTo(0.0f); - if (!innerProductOp->Forward(srcMat, umat_blobs[0], (bias) ? umat_blobs[1] : UMat(), dstMat)) + if (!innerProductOp->Forward(srcMat, (use_half) ? half_blobs[0] : umat_blobs[0], + (bias) ? (use_half ? half_blobs[1] : umat_blobs[1]) : UMat(), + dstMat)) { ret = false; break; } - if (bias && (outerSize > 1)) + if (!use_half && bias && (outerSize > 1)) { UMat& biases = umat_blobs[1]; cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0); @@ -353,7 +368,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/layers/lrn_layer.cpp b/modules/dnn/src/layers/lrn_layer.cpp index 25eb1540b1..1b2a902af0 100644 --- a/modules/dnn/src/layers/lrn_layer.cpp +++ b/modules/dnn/src/layers/lrn_layer.cpp @@ -106,6 +106,7 @@ public: std::vector inputs; std::vector outputs; + bool use_half = (inps.depth() == CV_16S); inps.getUMatVector(inputs); outs.getUMatVector(outputs); @@ -128,6 +129,7 @@ public: config.height = inputs[0].size[2]; config.width = inputs[0].size[3]; config.norm_by_size = normBySize; + config.use_half = use_half; lrnOp = Ptr >(new OCL4DNNLRN(config)); } @@ -146,7 +148,7 @@ public: CV_Assert(inputs_arr.total() == outputs_arr.total()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/layers/mvn_layer.cpp b/modules/dnn/src/layers/mvn_layer.cpp index f948c718c2..647308ae0a 100644 --- a/modules/dnn/src/layers/mvn_layer.cpp +++ b/modules/dnn/src/layers/mvn_layer.cpp @@ -102,6 +102,9 @@ public: { UMat bnorm_weight = scale.empty() ? 
UMat() : scale.getUMat(ACCESS_READ); UMat bnorm_bias = shift.empty() ? UMat() : shift.getUMat(ACCESS_READ); + bool use_half = (inputs[0].depth() == CV_16S); + String opts = format(" -DT=%s -DT4=%s -Dconvert_T=%s", use_half ? "half" : "float", + use_half ? "half4" : "float4", use_half ? "convert_half4" : "convert_float4"); int splitDim = (acrossChannels) ? 1 : 2; for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++) @@ -111,12 +114,11 @@ public: int newRows = total(shape(inpMat), 0, splitDim); MatShape s = shape(newRows, inpMat.total() / newRows); - UMat oneMat = UMat::ones(s[1], 1, CV_32F); - UMat meanMat = UMat(s[0], 1, CV_32F); + UMat meanMat = UMat(s[0], 1, (use_half) ? CV_16S : CV_32F); UMat tmpMat = UMat(s[0], s[1], CV_32F); float alpha = 1.0f / s[1]; - String buildopt = "-DNUM=4"; + String buildopt = "-DNUM=4" + opts; ocl::Kernel k("mean_fuse4", ocl::dnn::mvn_oclsrc, buildopt); size_t localsize[] = { 128 }; size_t globalsize[] = { (size_t)s[0] / 4 * localsize[0] }; @@ -167,13 +169,14 @@ public: int row_size = total(shape(inputs[0]), 0, splitDim); int plane_size = total(shape(inputs[0]), splitDim); if (normVariance && (row_size % 4 == 0) && (plane_size % 4 == 0)) - { - bool ret = fast_forward_ocl(inputs, outputs); - return ret; - } + return fast_forward_ocl(inputs, outputs); + + if (inputs[0].depth() == CV_16S) + return false; UMat bnorm_weight = scale.empty() ? UMat() : scale.getUMat(ACCESS_READ); UMat bnorm_bias = shift.empty() ? UMat() : shift.getUMat(ACCESS_READ); + String opts = format(" -DT=float -DT4=float4 -Dconvert_T=convert_float4"); for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++) { @@ -195,7 +198,7 @@ public: int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 
4 : 1); size_t global[] = { (size_t)s[0], (size_t)(s[1] / number) }; - String buildopt = format("-DNUM=%d", number); + String buildopt = format("-DNUM=%d", number) + opts; if (normVariance) { String kname = format("calc_mean%d", number); @@ -249,7 +252,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/layers/normalize_bbox_layer.cpp b/modules/dnn/src/layers/normalize_bbox_layer.cpp index 5e8ed65157..e2fc2c9b27 100644 --- a/modules/dnn/src/layers/normalize_bbox_layer.cpp +++ b/modules/dnn/src/layers/normalize_bbox_layer.cpp @@ -87,6 +87,9 @@ public: std::vector outputs; std::vector internals; + if (inputs_.depth() == CV_16S) + return false; + inputs_.getUMatVector(inputs); outputs_.getUMatVector(outputs); internals_.getUMatVector(internals); @@ -162,7 +165,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/layers/permute_layer.cpp b/modules/dnn/src/layers/permute_layer.cpp index 85ce8837bc..d4f756ced5 100644 --- a/modules/dnn/src/layers/permute_layer.cpp +++ b/modules/dnn/src/layers/permute_layer.cpp @@ -288,9 +288,11 @@ public: if (!_needsPermute) return false; + bool use_half = (inps.depth() == CV_16S); + String opts = format("-DDtype=%s", use_half ? 
"half" : "float"); for (size_t i = 0; i < inputs.size(); i++) { - ocl::Kernel kernel("permute", ocl::dnn::permute_oclsrc); + ocl::Kernel kernel("permute", ocl::dnn::permute_oclsrc, opts); kernel.set(0, (int)_count); kernel.set(1, ocl::KernelArg::PtrReadOnly(inputs[i])); @@ -313,7 +315,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp index bee9d5d62a..2bcce1d91e 100644 --- a/modules/dnn/src/layers/pooling_layer.cpp +++ b/modules/dnn/src/layers/pooling_layer.cpp @@ -147,6 +147,7 @@ public: std::vector inputs; std::vector outputs; + bool use_half = (inps.depth() == CV_16S); inps.getUMatVector(inputs); outs.getUMatVector(outputs); @@ -164,6 +165,7 @@ public: (type == AVE ? 
LIBDNN_POOLING_METHOD_AVE : LIBDNN_POOLING_METHOD_STO); config.avePoolPaddedArea = avePoolPaddedArea; + config.use_half = use_half; poolOp = Ptr >(new OCL4DNNPool(config)); } @@ -189,7 +191,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/layers/prior_box_layer.cpp b/modules/dnn/src/layers/prior_box_layer.cpp index 81a7392376..b854c2602a 100644 --- a/modules/dnn/src/layers/prior_box_layer.cpp +++ b/modules/dnn/src/layers/prior_box_layer.cpp @@ -316,6 +316,7 @@ public: std::vector inputs; std::vector outputs; + bool use_half = (inps.depth() == CV_16S); inps.getUMatVector(inputs); outs.getUMatVector(outputs); @@ -340,9 +341,15 @@ public: heights.copyTo(umat_heights); } + String opts; + if (use_half) + opts = "-DDtype=half -DDtype4=half4 -Dconvert_T=convert_half4"; + else + opts = "-DDtype=float -DDtype4=float4 -Dconvert_T=convert_float4"; + size_t nthreads = _layerHeight * _layerWidth; + ocl::Kernel kernel("prior_box", ocl::dnn::prior_box_oclsrc, opts); - ocl::Kernel kernel("prior_box", ocl::dnn::prior_box_oclsrc); kernel.set(0, (int)nthreads); kernel.set(1, (float)_stepX); kernel.set(2, (float)_stepY); @@ -375,7 +382,7 @@ public: // set the variance. 
{ - ocl::Kernel kernel("set_variance", ocl::dnn::prior_box_oclsrc); + ocl::Kernel kernel("set_variance", ocl::dnn::prior_box_oclsrc, opts); int offset = total(shape(outputs[0]), 2); size_t nthreads = _layerHeight * _layerWidth * _numPriors; kernel.set(0, (int)nthreads); @@ -395,7 +402,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/layers/proposal_layer.cpp b/modules/dnn/src/layers/proposal_layer.cpp index 7784e700ba..44671268a7 100644 --- a/modules/dnn/src/layers/proposal_layer.cpp +++ b/modules/dnn/src/layers/proposal_layer.cpp @@ -158,6 +158,9 @@ public: std::vector outputs; std::vector internals; + if (inputs_.depth() == CV_16S) + return false; + inputs_.getUMatVector(inputs); outputs_.getUMatVector(outputs); internals_.getUMatVector(internals); @@ -237,7 +240,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/layers/region_layer.cpp b/modules/dnn/src/layers/region_layer.cpp index bcf038ce9f..125fa0d14d 100644 --- a/modules/dnn/src/layers/region_layer.cpp +++ b/modules/dnn/src/layers/region_layer.cpp @@ -127,7 +127,7 @@ public: std::vector outputs; // TODO: implement a logistic activation to classification scores. 
- if (useLogistic) + if (useLogistic || inps.depth() == CV_16S) return false; inps.getUMatVector(inputs); @@ -191,7 +191,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/layers/reorg_layer.cpp b/modules/dnn/src/layers/reorg_layer.cpp index a34264f931..f6102c4ef5 100644 --- a/modules/dnn/src/layers/reorg_layer.cpp +++ b/modules/dnn/src/layers/reorg_layer.cpp @@ -96,9 +96,10 @@ public: std::vector inputs; std::vector outputs; + bool use_half = (inps.depth() == CV_16S); inps.getUMatVector(inputs); outs.getUMatVector(outputs); - String buildopt = String("-DDtype=") + ocl::typeToStr(inputs[0].type()) + String(" "); + String buildopt= format("-DDtype=%s ", use_half ? "half" : "float"); for (size_t i = 0; i < inputs.size(); i++) { @@ -134,7 +135,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/layers/reshape_layer.cpp b/modules/dnn/src/layers/reshape_layer.cpp index f33ed423de..6b2100cdab 100644 --- a/modules/dnn/src/layers/reshape_layer.cpp +++ b/modules/dnn/src/layers/reshape_layer.cpp @@ -219,7 +219,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp index 
826c640b5f..4b3a975b2a 100644 --- a/modules/dnn/src/layers/slice_layer.cpp +++ b/modules/dnn/src/layers/slice_layer.cpp @@ -181,6 +181,7 @@ public: std::vector inputs; std::vector outputs; + bool use_half = (inputs_.depth() == CV_16S); inputs_.getUMatVector(inputs); outputs_.getUMatVector(outputs); @@ -188,6 +189,11 @@ public: (total(shape(outputs[0]), 2) % 4 != 0)) return false; + String opts; + if (use_half) + opts = "-DDtype=half -DDtype4=half4 -DDtype8=half8"; + else + opts = "-DDtype=float -DDtype4=float4 -DDtype8=float8"; const UMat& inpMat = inputs[0]; for (size_t i = 0; i < outputs.size(); i++) { @@ -196,7 +202,7 @@ public: int rows = outputs[i].size[2]; int cols = outputs[i].size[3]; - ocl::Kernel kernel("slice", ocl::dnn::slice_oclsrc); + ocl::Kernel kernel("slice", ocl::dnn::slice_oclsrc, opts); size_t local[] = { 128 }; size_t global[] = { (size_t)groups * channels / 4 * local[0] }; int idx = 0; @@ -222,7 +228,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/layers/softmax_layer.cpp b/modules/dnn/src/layers/softmax_layer.cpp index 171215341b..c26028e000 100644 --- a/modules/dnn/src/layers/softmax_layer.cpp +++ b/modules/dnn/src/layers/softmax_layer.cpp @@ -99,15 +99,16 @@ public: softmaxOp.release(); } - bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays itns) + bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_) { std::vector inputs; std::vector outputs; std::vector internals; - inps.getUMatVector(inputs); - outs.getUMatVector(outputs); - itns.getUMatVector(internals); + bool use_half = (inputs_.depth() == CV_16S); + inputs_.getUMatVector(inputs); + outputs_.getUMatVector(outputs); 
+ internals_.getUMatVector(internals); if (softmaxOp.empty()) { @@ -117,6 +118,7 @@ public: config.axis = axisRaw; config.channels = inputs[0].size[axisRaw]; config.logsoftmax = logSoftMax; + config.use_half = use_half; softmaxOp = Ptr >(new OCL4DNNSoftmax(config)); } @@ -128,15 +130,13 @@ public: return true; UMat& bufMat = internals[0]; - src.copyTo(dstMat); - int axis = clamp(axisRaw, src.dims); MatShape s = shape(src); size_t outerSize = total(s, 0, axis); size_t channels = src.size[axis]; size_t innerSize = total(s, axis + 1); - String buildOpts = String("-DT=") + ocl::typeToStr(src.type()); + String buildOpts = format("-DT=%s", use_half ? "half" : "float"); ocl::Kernel kmax, ksub, ksum, kdiv; if (!kmax.create("kernel_channel_max", ocl::dnn::softmax_oclsrc, buildOpts)) @@ -152,38 +152,31 @@ public: if (!kdiv.create("kernel_channel_div", ocl::dnn::softmax_oclsrc, buildOpts)) return false; - size_t wgSize = ocl::Device::getDefault().maxWorkGroupSize(); size_t bufSize = internals[0].total(); size_t totalSize = src.total(); - // adjust local/global size - size_t internal_localSize[1] = { (bufSize == 1) ? 1 : wgSize }; - size_t internal_globalSize[1] = { divUp(bufSize, (unsigned int)internal_localSize[0]) * internal_localSize[0] }; - - // adjust local/global size (total) - size_t total_localSize[1] = { (totalSize == 1) ? 
1 : wgSize }; - size_t total_globalSize[1] = { divUp(totalSize, (unsigned int)total_localSize[0]) * total_localSize[0] }; + size_t internal_globalSize[1] = { bufSize }; + size_t total_globalSize[1] = { totalSize }; kmax.args((int)outerSize, (int)channels, (int)innerSize, - ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat)); - if (!kmax.run(1, internal_globalSize, internal_localSize, false)) + ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrReadWrite(bufMat)); + if (!kmax.run(1, internal_globalSize, NULL, false)) return false; ksub.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize, - ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat)); - if (!ksub.run(1, total_globalSize, total_localSize, false)) + ocl::KernelArg::PtrReadOnly(bufMat), + ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrWriteOnly(dstMat)); + if (!ksub.run(1, total_globalSize, NULL, false)) return false; - cv::exp(dstMat, dstMat); - ksum.args((int)outerSize, (int)channels, (int)innerSize, ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat)); - if (!ksum.run(1, internal_globalSize, internal_localSize, false)) + if (!ksum.run(1, internal_globalSize, NULL, false)) return false; kdiv.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize, ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat)); - if (!kdiv.run(1, total_globalSize, total_localSize, false)) + if (!kdiv.run(1, total_globalSize, NULL, false)) return false; return true; @@ -195,7 +188,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/nms.cpp b/modules/dnn/src/nms.cpp index 3adaef165d..62bda79c15 100644 --- a/modules/dnn/src/nms.cpp +++ 
b/modules/dnn/src/nms.cpp @@ -8,6 +8,8 @@ #include "precomp.hpp" #include "nms.inl.hpp" +#include + namespace cv { namespace dnn @@ -28,6 +30,27 @@ void NMSBoxes(const std::vector& bboxes, const std::vector& scores, NMSFast_(bboxes, scores, score_threshold, nms_threshold, eta, top_k, indices, rectOverlap); } +static inline float rotatedRectIOU(const RotatedRect& a, const RotatedRect& b) +{ + std::vector inter; + int res = rotatedRectangleIntersection(a, b, inter); + if (inter.empty() || res == INTERSECT_NONE) + return 0.0f; + if (res == INTERSECT_FULL) + return 1.0f; + float interArea = contourArea(inter); + return interArea / (a.size.area() + b.size.area() - interArea); +} + +void NMSBoxes(const std::vector& bboxes, const std::vector& scores, + const float score_threshold, const float nms_threshold, + std::vector& indices, const float eta, const int top_k) +{ + CV_Assert(bboxes.size() == scores.size(), score_threshold >= 0, + nms_threshold >= 0, eta > 0); + NMSFast_(bboxes, scores, score_threshold, nms_threshold, eta, top_k, indices, rotatedRectIOU); +} + CV__DNN_EXPERIMENTAL_NS_END }// dnn }// cv diff --git a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp index 6eb60eef05..e0ce77e27a 100644 --- a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp +++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp @@ -59,7 +59,8 @@ struct OCL4DNNConvConfig stride(1, 1), dilation(1, 1), group(1), - bias_term(false) + bias_term(false), + use_half(false) {} MatShape in_shape; MatShape out_shape; @@ -69,6 +70,7 @@ struct OCL4DNNConvConfig Size dilation; int group; // = 1; bool bias_term; // = false; + bool use_half; // = false; }; typedef enum { @@ -272,6 +274,8 @@ class OCL4DNNConvSpatial int32_t group_; bool bias_term_; UMat swizzled_weights_umat; + UMat weights_half; + UMat bias_half; UMat bottom_data2_; int32_t bottom_index_; @@ -327,6 +331,7 @@ class OCL4DNNConvSpatial ocl4dnnFusedActiv_t fused_activ_; float power_; bool fused_eltwise_; + bool 
use_half_; }; typedef enum { @@ -345,7 +350,8 @@ struct OCL4DNNPoolConfig channels(0), pool_method(LIBDNN_POOLING_METHOD_MAX), global_pooling(false), - avePoolPaddedArea(false) + avePoolPaddedArea(true), + use_half(false) {} MatShape in_shape; MatShape out_shape; @@ -358,6 +364,7 @@ struct OCL4DNNPoolConfig ocl4dnnPoolingMethod_t pool_method; // = LIBDNN_POOLING_METHOD_MAX; bool global_pooling; // = false; bool avePoolPaddedArea; + bool use_half; }; template @@ -391,13 +398,14 @@ class OCL4DNNPool int32_t pooled_height_; int32_t pooled_width_; bool avePoolPaddedArea; + bool use_half; }; struct OCL4DNNInnerProductConfig { OCL4DNNInnerProductConfig() : num_output(0), M(0), K(0), - bias_term(false), transpose(false), phase_test(true) + bias_term(false), transpose(false), phase_test(true), use_half(false) {} int num_output; int M; @@ -405,6 +413,7 @@ struct OCL4DNNInnerProductConfig bool bias_term; bool transpose; // = false; bool phase_test; // = true; + bool use_half; // = false; }; template @@ -428,6 +437,7 @@ class OCL4DNNInnerProduct bool transpose_; bool image_copied_; bool phase_test_; + bool use_half_; }; typedef enum { @@ -441,7 +451,7 @@ struct OCL4DNNLRNConfig lrn_type(LRNParameter_NormRegion_ACROSS_CHANNELS), phase_test(true), local_size(0), alpha(0.f), beta(0.f), k(0.f), norm_by_size(false), - batch_size(0), channels(0), height(0), width(0) + batch_size(0), channels(0), height(0), width(0), use_half(false) {} MatShape in_shape; LRNParameter_NormRegion_WITHIN_CHANNEL_t lrn_type; @@ -455,6 +465,7 @@ struct OCL4DNNLRNConfig int32_t channels; int32_t height; int32_t width; + bool use_half; }; template @@ -477,16 +488,18 @@ class OCL4DNNLRN int32_t height_; int32_t width_; bool norm_by_size_; + bool use_half_; }; struct OCL4DNNSoftmaxConfig { - OCL4DNNSoftmaxConfig() : axis(0), channels(0), logsoftmax(false) + OCL4DNNSoftmaxConfig() : axis(0), channels(0), logsoftmax(false), use_half(false) {} MatShape in_shape; int axis; int channels; bool logsoftmax; + bool 
use_half; }; template @@ -506,6 +519,7 @@ class OCL4DNNSoftmax bool use_slm_; bool log_softmax_; UMat scale_data_; + bool use_half_; }; }}} // namespace cv::dnn::ocl4dnn diff --git a/modules/dnn/src/ocl4dnn/src/math_functions.cpp b/modules/dnn/src/ocl4dnn/src/math_functions.cpp index 3f4a70bc03..b2dda73f75 100644 --- a/modules/dnn/src/ocl4dnn/src/math_functions.cpp +++ b/modules/dnn/src/ocl4dnn/src/math_functions.cpp @@ -48,6 +48,12 @@ namespace cv { namespace dnn { namespace ocl4dnn { +enum gemm_data_type_t +{ + TYPE_FLOAT = 1, + TYPE_HALF = 2 +}; + // Create and copy buffer to image for GEMM's matrix A and B. // Will return image to caller if the input image is NULL. Otherwise, // will use the image directly. It's caller's responsibility to @@ -60,6 +66,7 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset, int width, int ld) { ocl::Image2D image; + String opts = format("-DTYPE=%d", TYPE_FLOAT); if (!is_matrix_a && transpose) { @@ -73,7 +80,8 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset, UMat mat(height, width, CV_32FC1); image = ocl::Image2D(mat); - ocl::Kernel oclk_gemm_copy("gemm_buffer_copy_image_transpose_float", ocl::dnn::gemm_image_oclsrc); + ocl::Kernel oclk_gemm_copy("gemm_buffer_copy_image_transpose_float", + ocl::dnn::gemm_image_oclsrc, opts); size_t global_copy[2]; global_copy[0] = width; @@ -96,7 +104,7 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset, image = ocl::Image2D(mat); ocl::Kernel oclk_gemm_copy("gemm_buffer_copy_image_no_transpose_float", - ocl::dnn::gemm_image_oclsrc); + ocl::dnn::gemm_image_oclsrc, opts); size_t global_copy[2]; global_copy[0] = padded_width; @@ -129,7 +137,7 @@ enum gemm_type_t GEMM_TYPE_FAST_IMAGE_32_1, GEMM_TYPE_FAST_IMAGE_32_2, GEMM_TYPE_FAST_IMAGE_B_IMAGE, - GEMM_TYPE_MAX + GEMM_TYPE_FAST_BUFFER }; template @@ -145,6 +153,8 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA, CHECK_EQ(gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || gemm_type == 
GEMM_TYPE_FAST_IMAGE_32_2 || gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE, true) << "Invalid fast image gemm type." << std::endl; + bool halfPrecisionMode = (A.depth() == CV_16S); + if (is_image_a) { CHECK_EQ(offA, 0) << "Invalid input image offset." << std::endl; @@ -157,6 +167,7 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA, return false; } + String opts = format("-DTYPE=%d", halfPrecisionMode ? TYPE_HALF : TYPE_FLOAT); int widthA = (TransA == CblasNoTrans) ? K : M; int heightA = (TransA == CblasNoTrans) ? M : K; int widthB = (TransB == CblasNoTrans) ? N : K; @@ -178,7 +189,7 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA, int blockC_width = blocksize; int blockC_height = blocksize; - int use_buffer_indicator = 8; + int use_buffer_indicator = (halfPrecisionMode) ? 16 : 8; // To fix the edge problem caused by the sub group block read. // we have to pad the image if it's not multiple of tile. // just padding one line is enough as the sub group block read @@ -221,9 +232,13 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA, else kernel_name += "1"; - kernel_name += "_float"; + if (halfPrecisionMode) { + kernel_name += "_half"; + } else { + kernel_name += "_float"; + } - ocl::Kernel oclk_gemm_float(kernel_name.c_str(), ocl::dnn::gemm_image_oclsrc); + ocl::Kernel oclk_gemm_float(kernel_name.c_str(), ocl::dnn::gemm_image_oclsrc, opts); if (oclk_gemm_float.empty()) return false; @@ -255,6 +270,10 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA, bool padding_A = false; bool padding_B = false; + if (halfPrecisionMode && is_image_b) { + padding_A = true; + } + if (!is_image_a && !is_image_b) { if (M * K < N * K) @@ -265,17 +284,19 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA, if (!is_image_a) { - ImA = ocl4dnnGEMMCopyBufferToImage(A, blockA_offset, - true, TransA != CblasNoTrans, - padding_A, imageA_h, imageA_w, - blockA_height, blockA_width, ldA); + if (!halfPrecisionMode) + ImA = 
ocl4dnnGEMMCopyBufferToImage(A, blockA_offset, + true, TransA != CblasNoTrans, + padding_A, imageA_h, imageA_w, + blockA_height, blockA_width, ldA); } if (!is_image_b) { - ImB = ocl4dnnGEMMCopyBufferToImage(B, blockB_offset, - false, false, - padding_B, imageB_h, imageB_w, - blockB_height, blockB_width, ldB); + if (!halfPrecisionMode) + ImB = ocl4dnnGEMMCopyBufferToImage(B, blockB_offset, + false, false, + padding_B, imageB_h, imageB_w, + blockB_height, blockB_width, ldB); } } else { // We will use normal read_imagef to read image B when B has transpose. @@ -283,32 +304,48 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA, if (!is_image_a) { bool padding; - padding = !is_image_b; - ImA = ocl4dnnGEMMCopyBufferToImage(A, blockA_offset, - true, TransA != CblasNoTrans, - padding, imageA_h, imageA_w, - blockA_height, blockA_width, ldA); + padding = !is_image_b || halfPrecisionMode; + if (!halfPrecisionMode) + ImA = ocl4dnnGEMMCopyBufferToImage(A, blockA_offset, + true, TransA != CblasNoTrans, + padding, imageA_h, imageA_w, + blockA_height, blockA_width, ldA); } if (!is_image_b && (K % use_buffer_indicator != 0)) { - ImB = ocl4dnnGEMMCopyBufferToImage(B, blockB_offset, - false, true, false, imageB_h, imageB_w, - blockB_height, blockB_width, ldB); + if (!halfPrecisionMode) + ImB = ocl4dnnGEMMCopyBufferToImage(B, blockB_offset, + false, true, false, + imageB_h, imageB_w, + blockB_height, blockB_width, ldB); } } size_t global[2]; if (gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE) { - global[0] = (size_t)( blockC_width + 7 ) & ~7; + if (halfPrecisionMode) { + global[0] = (size_t)( blockC_width + 15 ) & ~15; + } else { + global[0] = (size_t)( blockC_width + 7 ) & ~7; + } } else { - global[0] = (size_t)( (blockC_width / 2 ) + 7 ) ^ ~7; + if (halfPrecisionMode) { + global[0] = (size_t)( (blockC_width / 2 ) + 15 ) ^ ~15; + } else { + global[0] = (size_t)( (blockC_width / 2 ) + 7 ) ^ ~7; + } } global[1] = (size_t)(blockC_height 
+ 31) / 32; size_t local[2]; - local[0] = 8; + if (halfPrecisionMode) + { + local[0] = 16; + } else { + local[0] = 8; + } local[1] = 1; cl_uint arg_idx = 0; @@ -385,6 +422,101 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA, return true; } +template +static bool ocl4dnnFastBufferGEMM(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int32_t M, + const int32_t N, const int32_t K, const Dtype alpha, + const UMat A, const int32_t offA, const UMat B, + const int32_t offB, const Dtype beta, UMat C, + const int32_t offC, enum gemm_type_t gemm_type) +{ + CHECK_EQ(gemm_type == GEMM_TYPE_FAST_BUFFER, true) + << "Invalid fast buffer gemm type." << std::endl; + + bool halfPrecisionMode = (A.depth() == CV_16S); + + size_t sub_group_size = 8; + bool is_small_batch = (M == 2 || M == 4 || M == 8); + String kernel_name("gemm_buffer_"); + if (TransA == CblasNoTrans && TransB == CblasNoTrans) { + kernel_name += "NN"; + if (halfPrecisionMode) { + sub_group_size = 16; + } + } else if (TransA == CblasNoTrans && TransB != CblasNoTrans) { + if (M == 2) + kernel_name +="NT_M_2"; + else if (M == 4) + kernel_name +="NT_M_4"; + else if (M == 8) + kernel_name +="NT_M_8"; + else + kernel_name += "NT"; + } + + if (halfPrecisionMode) { + kernel_name += "_half"; + } else { + kernel_name += "_float"; + } + + String opts = format("-DTYPE=%d", halfPrecisionMode ? TYPE_HALF : TYPE_FLOAT); + ocl::Kernel oclk_gemm_float(kernel_name.c_str(), ocl::dnn::gemm_buffer_oclsrc, opts); + size_t local[2] = {}; + size_t global[2] = {}; + if (TransA == CblasNoTrans && TransB != CblasNoTrans && is_small_batch) { + if (M == 8) + local[0] = 16; + else if (M == 4) + local[0] = 32; + else + local[0] = 64; + local[1] = 1; + + if (M == 8) + global[0] = N * local[0]; + else + global[0] = (N + 3) / 4 * local[0]; + global[1] = 1; + } else { + size_t lx = sub_group_size; + size_t ly = (TransB != CblasNoTrans && TransA == CblasNoTrans && halfPrecisionMode) ? 
2 : 4; + int dx = (TransB != CblasNoTrans && TransA == CblasNoTrans) ? 1 : 4; + int dy = 8; + size_t gx = (size_t)(N + dx - 1) / dx; + size_t gy = (size_t)(M + dy - 1) / dy; + global[0] = (gx + lx - 1) / lx * lx; + global[1] = (gy + ly - 1) / ly * ly; + local[0] = lx; + local[1] = ly; + } + + int arg_idx = 0; + oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrReadOnly(A)); + oclk_gemm_float.set(arg_idx++, offA); + oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrReadOnly(B)); + oclk_gemm_float.set(arg_idx++, offB); + oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrWriteOnly(C)); + oclk_gemm_float.set(arg_idx++, offC); + oclk_gemm_float.set(arg_idx++, M); + oclk_gemm_float.set(arg_idx++, N); + oclk_gemm_float.set(arg_idx++, K); + oclk_gemm_float.set(arg_idx++, (float)alpha); + oclk_gemm_float.set(arg_idx++, (float)beta); + + bool ret; + if (TransB == CblasNoTrans || TransA != CblasNoTrans) { + int stride = 256; + for (int start_index = 0; start_index < K; start_index += stride) { + oclk_gemm_float.set(arg_idx, start_index); + ret = oclk_gemm_float.run(2, global, local, false); + } + } else { + ret = oclk_gemm_float.run(2, global, local, false); + } + return ret; +} + template bool ocl4dnnGEMMCommon(const CBLAS_TRANSPOSE TransB, const int32_t M, const int32_t N, const int32_t K, @@ -392,7 +524,8 @@ bool ocl4dnnGEMMCommon(const CBLAS_TRANSPOSE TransB, const UMat B_image, UMat C, const size_t max_image_size) { - gemm_type_t gemm_type = GEMM_TYPE_FAST_IMAGE_32_1; + bool halfPrecisionMode = (A.depth() == CV_16S); + gemm_type_t gemm_type = halfPrecisionMode ? 
GEMM_TYPE_FAST_BUFFER : GEMM_TYPE_FAST_IMAGE_32_1; if (gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || gemm_type == GEMM_TYPE_FAST_IMAGE_32_2) @@ -409,6 +542,11 @@ bool ocl4dnnGEMMCommon(const CBLAS_TRANSPOSE TransB, GEMM_TYPE_FAST_IMAGE_B_IMAGE, max_image_size); } + else if (gemm_type == GEMM_TYPE_FAST_BUFFER) + { + return ocl4dnnFastBufferGEMM(CblasNoTrans, TransB, M, N, K, + 1.f, A, 0, B, 0, 0.f, C, 0, gemm_type); + } return false; } @@ -436,10 +574,17 @@ bool ocl4dnnGEMV(const CBLAS_TRANSPOSE TransA, const int32_t offy) { bool ret = false; + bool use_half = (A.depth() == CV_16S); + String opts; + if (use_half) + opts = format("-DDtype=%s -DDtype4=%s -Dconvert_Dtype=convert_%s", "half", "half4", "half"); + else + opts = format("-DDtype=%s -DDtype4=%s -Dconvert_Dtype=convert_%s", "float", "float4", "float"); if (TransA == CblasNoTrans) { - ocl::Kernel k(CL_KERNEL_SELECT("matvec_mul4"), cv::ocl::dnn::matvec_mul_oclsrc); + String kname = format("matvec_mul4_%s", use_half ? "half" : "float"); + ocl::Kernel k(kname.c_str(), cv::ocl::dnn::matvec_mul_oclsrc, opts); if (k.empty()) return false; @@ -469,7 +614,8 @@ bool ocl4dnnGEMV(const CBLAS_TRANSPOSE TransA, if ((row_size % 4) != 0 && ret) { - ocl::Kernel k_1(CL_KERNEL_SELECT("matvec_mul1"), cv::ocl::dnn::matvec_mul_oclsrc); + String kname = format("matvec_mul1_%s", use_half ? 
"half" : "float"); + ocl::Kernel k_1(kname.c_str(), cv::ocl::dnn::matvec_mul_oclsrc, opts); size_t localsize[] = { 128 }; size_t globalsize[] = { row_size % 4 * localsize[0] }; uint row_offset = row_size - (row_size % 4); @@ -499,7 +645,15 @@ bool ocl4dnnAXPY(const int32_t N, const Dtype alpha, const UMat X, const int32_t offX, UMat Y, const int32_t offY) { - ocl::Kernel oclk_axpy(CL_KERNEL_SELECT("axpy"), cv::ocl::dnn::math_oclsrc); + bool use_half = (X.depth() == CV_16S); + String opts; + if (use_half) + opts = "-DDtype=half -DDtype4=half4 -Dconvert_Dtype=convert_half"; + else + opts = "-DDtype=float -DDtype4=float4 -Dconvert_Dtype=convert_float"; + + String kname = format("axpy_%s", use_half ? "half" : "float"); + ocl::Kernel oclk_axpy(kname.c_str(), cv::ocl::dnn::math_oclsrc, opts); if (oclk_axpy.empty()) return false; diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp index 85432293a4..44a622f1d4 100644 --- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp @@ -54,6 +54,7 @@ #include "opencl_kernels_dnn.hpp" #include "../include/math_functions.hpp" #include "../include/default_kernel_config.hpp" +#include "opencv2/dnn/shape_utils.hpp" #if defined WIN32 || defined _WIN32 #include @@ -85,6 +86,7 @@ OCL4DNNConvSpatial::OCL4DNNConvSpatial(OCL4DNNConvConfig config) max_value_ = 0; prev_kernel_type_ = -1; tuned_ = false; + use_half_ = config.use_half; // assumption: spatial dimension is 2. 
kernel_h_ = config.kernel.height; @@ -204,18 +206,40 @@ void OCL4DNNConvSpatial::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bo return; } +typedef enum { + TYPE_FLOAT = 1, + TYPE_HALF = 2 +} ocl4dnnConvSpatialType_t; + template void OCL4DNNConvSpatial::collectCommonInformation() { - addDef("Dtype", "float"); - addDef("Dtype2", "float2"); - addDef("Dtype4", "float4"); - addDef("Dtype8", "float8"); - addDef("Dtype16", "float16"); - addDef("as_Dtype", "as_float"); - addDef("as_Dtype2", "as_float2"); - addDef("as_Dtype4", "as_float4"); - addDef("as_Dtype8", "as_float8"); + if (use_half_) + { + addDef("TYPE", TYPE_HALF); + addDef("Dtype", "half"); + addDef("Dtype2", "half2"); + addDef("Dtype4", "half4"); + addDef("Dtype8", "half8"); + addDef("Dtype16", "half16"); + addDef("as_Dtype", "as_half"); + addDef("as_Dtype2", "as_half2"); + addDef("as_Dtype4", "as_half4"); + addDef("as_Dtype8", "as_half8"); + } + else + { + addDef("TYPE", TYPE_FLOAT); + addDef("Dtype", "float"); + addDef("Dtype2", "float2"); + addDef("Dtype4", "float4"); + addDef("Dtype8", "float8"); + addDef("Dtype16", "float16"); + addDef("as_Dtype", "as_float"); + addDef("as_Dtype2", "as_float2"); + addDef("as_Dtype4", "as_float4"); + addDef("as_Dtype8", "as_float8"); + } } typedef enum { @@ -477,10 +501,16 @@ bool OCL4DNNConvSpatial::Forward(const UMat& bottom, fused_eltwise_ = false; } - prepareKernel(bottom, top, weight, bias, numImages); + if (use_half_ && bias_half.empty() && !bias.empty()) + convertFp16((UMat&)bias, bias_half); + + if (use_half_ && weights_half.empty()) + convertFp16((UMat&)weight, weights_half); + + prepareKernel(bottom, top, weight, (use_half_) ? bias_half : bias, numImages); if (bestKernelConfig.empty()) return false; - return convolve(bottom, top, weight, bias, numImages, bestKernelConfig); + return convolve(bottom, top, weight, (use_half_) ? 
bias_half : bias, numImages, bestKernelConfig); } template @@ -556,6 +586,12 @@ std::string OCL4DNNConvSpatial::generateSpecificKey(int32_t type, int32_t << "_" << blockWidth << "_" << blockHeight << "_" << blockDepth; + + if (!use_half_) + keyBuilder << "_float"; + else + keyBuilder << "_half"; + return keyBuilder.str(); } @@ -637,9 +673,13 @@ bool OCL4DNNConvSpatial::swizzleWeight(const UMat &weight, if (swizzled_weights_umat.empty()) swizzled_weights_umat.create(1, (int)alignSize(num_output_, 16) * channels_ * - kernel_h_ * (int)alignSize(kernel_w_, 2), CV_32FC1); + kernel_h_ * (int)alignSize(kernel_w_, 2), + (use_half_) ? CV_16SC1 : CV_32FC1); + + UMat swizzled_weights_tmp; + if (use_half_) + swizzled_weights_tmp.create(shape(swizzled_weights_umat), CV_32F); - ocl::Queue queue = ocl::Queue::getDefault(); if (!interleave) { cl_uint argIdx = 0; int32_t channels = channels_ / group_; @@ -650,7 +690,10 @@ bool OCL4DNNConvSpatial::swizzleWeight(const UMat &weight, return false; oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); - oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat)); + if (use_half_) + oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_tmp)); + else + oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat)); oclk_copy_weight.set(argIdx++, kernel_w_); oclk_copy_weight.set(argIdx++, kernel_h_); oclk_copy_weight.set(argIdx++, channels); @@ -669,7 +712,11 @@ bool OCL4DNNConvSpatial::swizzleWeight(const UMat &weight, // assumption: kernel dimesion is 2 Mat weightMat = weight.getMat(ACCESS_READ); Dtype* cpu_weight = (Dtype *)weightMat.ptr(); - Mat swizzledWeightMat = swizzled_weights_umat.getMat(ACCESS_WRITE); + Mat swizzledWeightMat; + if (use_half_) + swizzledWeightMat = swizzled_weights_tmp.getMat(ACCESS_WRITE); + else + swizzledWeightMat = swizzled_weights_umat.getMat(ACCESS_WRITE); Dtype* cpu_swizzled_weight = (Dtype *)swizzledWeightMat.ptr(); int 
interleavedRows = (kernel_w_ / 2) * 2; @@ -694,6 +741,10 @@ bool OCL4DNNConvSpatial::swizzleWeight(const UMat &weight, rowAlignment); free(tmpSwizzledWeight); } + + if (use_half_) + convertFp16(swizzled_weights_tmp, swizzled_weights_umat); + return true; } @@ -727,9 +778,10 @@ void OCL4DNNConvSpatial::CreateSubBuffer(const UMat& buffer, UMat& sub_bu cl_mem sub_mem; cl_buffer_region region; cl_int err; + size_t element_size = (use_half_) ? sizeof(short) : sizeof(float); - region.origin = offset * sizeof(float); - region.size = size * sizeof(float); + region.origin = offset * element_size; + region.size = size * element_size; sub_mem = clCreateSubBuffer((cl_mem)buffer.handle(ACCESS_READ), write_only ? CL_MEM_WRITE_ONLY : CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); @@ -739,8 +791,9 @@ void OCL4DNNConvSpatial::CreateSubBuffer(const UMat& buffer, UMat& sub_bu return; } - int step = sizeof(float), rows = size, cols = 1; - ocl::convertFromBuffer(sub_mem, step, rows, cols, CV_32FC1, sub_buffer); + int step = element_size, rows = size, cols = 1; + ocl::convertFromBuffer(sub_mem, step, rows, cols, + (use_half_) ? 
CV_16SC1 : CV_32FC1, sub_buffer); //decrease ocl mem refcount clReleaseMemObject(sub_mem); @@ -978,7 +1031,10 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, cl_uint argIdx = 0; setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); + if (use_half_) + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half)); + else + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); if (bias_term_) kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias)); kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); @@ -1018,7 +1074,10 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); kernel.set(argIdx++, image_offset); - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); + if (use_half_) + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half)); + else + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); kernel.set(argIdx++, kernel_offset); if (bias_term_) kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias)); @@ -1132,14 +1191,27 @@ bool OCL4DNNConvSpatial::verifyResult(const UMat &bottom, return false; int32_t sz[4] = {numImages, num_output_, output_h_, output_w_}; - top.zeros(4, sz, CV_32FC1); + top.zeros(4, sz, (use_half_) ? 
CV_16SC1 : CV_32FC1); bool saved_tuned = tuned_; tuned_ = false; convolve(bottom, top, weight, bias, numImages, config); tuned_ = saved_tuned; - float *data = (float *)top.getMat(ACCESS_READ).ptr(); - float *verify_data = (float *)verifyTop.getMat(ACCESS_READ).ptr(); + UMat new_top, new_verify_top; + float *data, *verify_data; + if (use_half_) + { + convertFp16(top, new_top); + convertFp16(verifyTop, new_verify_top); + + data = (float *)new_top.getMat(ACCESS_READ).ptr(); + verify_data = (float *)new_verify_top.getMat(ACCESS_READ).ptr(); + } + else + { + data = (float *)top.getMat(ACCESS_READ).ptr(); + verify_data = (float *)verifyTop.getMat(ACCESS_READ).ptr(); + } for (int32_t n = 0; n < num_; ++n) { for (int32_t g = 0; g < group_; ++g) { @@ -1148,9 +1220,19 @@ bool OCL4DNNConvSpatial::verifyResult(const UMat &bottom, for (int h = 0; h < output_h_ && !verificationFail; h++) for (int w = 0; w < output_w_; w++) { size_t offset = output_image_offset + out_ch * output_w_ * output_h_ + h * output_w_ + w; - if (fabs(data[offset] - verify_data[offset]) > 0.1 * fabs(verify_data[offset]) && - !(fabs(verify_data[offset]) < 1.e-3 && - fabs(data[offset] - verify_data[offset]) < 1.e-4)) + + float error_factor = fabs(data[offset] - verify_data[offset]); + if (use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) && + error_factor > 0.04 && !(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4)) + { + dbgPrint(printf("test verification failed @ image %d group %d" + "out_ch %d h %d w %d got %G expected %G\n", + n, g, out_ch, h, w, data[offset], verify_data[offset])); + verificationFail = 1; + goto out; + } + else if (!use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) && + !(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4)) { dbgPrint(printf("test verification failed @ image %d group %d" "out_ch %d h %d w %d got %G expected %G\n", @@ -1719,15 +1801,16 @@ void OCL4DNNConvSpatial::prepareKernel(const UMat &bottom, UMat &top, if (loadTunedConfig()) // 
check external storage return; - UMat benchData(1, numImages * top_dim_, CV_32FC1); + UMat benchData(1, numImages * top_dim_, (use_half_) ? CV_16SC1 : CV_32FC1); + + calculateBenchmark(bottom, benchData, (use_half_) ? weights_half : weight, bias, numImages); + if (force_auto_tuning_) { - calculateBenchmark(bottom, benchData, weight, bias, numImages); setupConvolution(bottom, top, weight, bias, numImages, benchData); } else { - calculateBenchmark(bottom, benchData, weight, bias, numImages); useFirstAvailable(bottom, top, weight, bias, numImages, benchData); } cacheTunedConfig(); diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_inner_product.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_inner_product.cpp index aabee57984..ee7a2c7b01 100644 --- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_inner_product.cpp +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_inner_product.cpp @@ -56,6 +56,7 @@ OCL4DNNInnerProduct::OCL4DNNInnerProduct(OCL4DNNInnerProductConfig config K_ = config.K; phase_test_ = config.phase_test; image_copied_ = false; + use_half_ = config.use_half; } template @@ -89,13 +90,24 @@ bool OCL4DNNInnerProduct::Forward(const UMat& bottom, if (M_ <= max_image_size && N_ <= max_image_size && K_ <= max_image_size && - cv::traits::Depth::value == CV_32F && ocl::Device::getDefault().intelSubgroupsSupport()) { ret = ocl4dnnGEMMCommon(transpose_ ? 
CblasNoTrans : CblasTrans, M_, N_, K_, bottom, weight, UMat(), top, max_image_size); } + + if (use_half_ && bias_term_) + { + UMat biasOneMat = UMat::ones(M_, 1, CV_32F); + UMat newbias, tmpTop; + + convertFp16(bias, newbias); + convertFp16(top, tmpTop); + cv::gemm(biasOneMat, newbias, 1, tmpTop, 1, tmpTop, 0); + convertFp16(tmpTop, top); + } + return ret; } } diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_lrn.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_lrn.cpp index c7062f4090..b0fcfa9f0b 100644 --- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_lrn.cpp +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_lrn.cpp @@ -61,6 +61,7 @@ OCL4DNNLRN::OCL4DNNLRN(OCL4DNNLRNConfig config) channels_ = config.channels; height_ = config.height; width_ = config.width; + use_half_ = config.use_half; } template @@ -97,8 +98,10 @@ bool OCL4DNNLRN::crossChannelForward(const UMat& bottom, UMat& top) int32_t n_threads = num_ * height_ * width_; size_t global_work_size_[1] = {(size_t)n_threads}; String opts = clOptionSupport("-cl-no-subgroup-ifp") ? " -cl-no-subgroup-ifp " : ""; + opts += format("-D Dtype=%s", (use_half_) ? "half" : "float"); ocl::Kernel oclk_lrn_fill; - if (!oclk_lrn_fill.create(CL_KERNEL_SELECT("lrn_full_no_scale"), ocl::dnn::ocl4dnn_lrn_oclsrc, opts)) + String kname = format("lrn_full_no_scale_%s", (use_half_) ? 
"half" : "float"); + if (!oclk_lrn_fill.create(kname.c_str(), ocl::dnn::ocl4dnn_lrn_oclsrc, opts)) return false; oclk_lrn_fill.set(argIdx++, n_threads); diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp index 2d9c4dcf77..81238e9f3e 100644 --- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp @@ -56,6 +56,7 @@ OCL4DNNPool::OCL4DNNPool(OCL4DNNPoolConfig config) channels_ = config.channels; pool_method_ = config.pool_method; avePoolPaddedArea = config.avePoolPaddedArea; + use_half = config.use_half; for (int i = 0; i < spatial_dims; ++i) { @@ -105,12 +106,15 @@ bool OCL4DNNPool::Forward(const UMat& bottom, case LIBDNN_POOLING_METHOD_MAX: { bool haveMask = !top_mask.empty(); + String kname = haveMask ? "max_pool_forward_mask" : "max_pool_forward"; + kname += (use_half) ? "_half" : "_float"; ocl::Kernel oclk_max_pool_forward( - haveMask ? CL_KERNEL_SELECT("max_pool_forward_mask") : CL_KERNEL_SELECT("max_pool_forward"), + kname.c_str(), ocl::dnn::ocl4dnn_pooling_oclsrc, - format("-D KERNEL_MAX_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d" + format(" -D Dtype=%s -D KERNEL_MAX_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d" " -D STRIDE_W=%d -D STRIDE_H=%d" " -D PAD_W=%d -D PAD_H=%d%s", + (use_half) ? "half" : "float", kernel_w_, kernel_h_, stride_w_, stride_h_, pad_w_, pad_h_, @@ -139,11 +143,14 @@ bool OCL4DNNPool::Forward(const UMat& bottom, { CV_Assert(top_mask.empty()); - ocl::Kernel oclk_ave_pool_forward(CL_KERNEL_SELECT("ave_pool_forward"), + String kname = format("ave_pool_forward_%s", (use_half) ? "half" : "float"); + ocl::Kernel oclk_ave_pool_forward( + kname.c_str(), ocl::dnn::ocl4dnn_pooling_oclsrc, - format("-D KERNEL_AVE_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d" + format(" -D Dtype=%s -D KERNEL_AVE_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d" " -D STRIDE_W=%d -D STRIDE_H=%d" " -D PAD_W=%d -D PAD_H=%d%s", + (use_half) ? 
"half" : "float", kernel_w_, kernel_h_, stride_w_, stride_h_, pad_w_, pad_h_, @@ -171,7 +178,9 @@ bool OCL4DNNPool::Forward(const UMat& bottom, { CV_Assert(top_mask.empty()); - ocl::Kernel oclk_sto_pool_forward(CL_KERNEL_SELECT("sto_pool_forward_test"), + String kname = format("sto_pool_forward_test_%s", (use_half) ? "half" : "float"); + ocl::Kernel oclk_sto_pool_forward( + kname.c_str(), ocl::dnn::ocl4dnn_pooling_oclsrc, format("-D KERNEL_STO_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d" " -D STRIDE_W=%d -D STRIDE_H=%d", diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp index 6b957649e8..78576711a7 100644 --- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp @@ -52,6 +52,7 @@ OCL4DNNSoftmax::OCL4DNNSoftmax(OCL4DNNSoftmaxConfig config) softmax_axis_ = config.axis; channels_ = config.channels; log_softmax_ = config.logsoftmax; + use_half_ = config.use_half; inner_num_ = 1; outer_num_ = 1; @@ -91,10 +92,13 @@ bool OCL4DNNSoftmax::Forward(const UMat& bottom, UMat& top) if (log_softmax_) opts += " -DLOG_SOFTMAX "; if (use_slm_) - kname = CL_KERNEL_SELECT("softmax_forward_slm"); + kname = "softmax_forward_slm"; else - kname = CL_KERNEL_SELECT("softmax_forward"); + kname = "softmax_forward"; + kname += format("%s", (use_half_) ? "_half" : "_float"); + opts += format(" -D Dtype=%s -D DTYPE_MAX=%s", (use_half_) ? "half" : "float", + (use_half_) ? 
"HALF_MAX" : "FLT_MAX"); if (!oclk_softmax_forward_kernel.create(kname.c_str(), ocl::dnn::softmax_loss_oclsrc, opts)) return false; diff --git a/modules/dnn/src/opencl/activations.cl b/modules/dnn/src/opencl/activations.cl index ab2532e533..9b5a9bb322 100644 --- a/modules/dnn/src/opencl/activations.cl +++ b/modules/dnn/src/opencl/activations.cl @@ -40,9 +40,17 @@ // //M*/ +#define CONCAT(A,B) A##_##B +#define TEMPLATE(name,type) CONCAT(name,type) +#define KERNEL_ARG_DTYPE float + +#if defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif + __kernel void ReLUForward(const int count, __global const T* in, __global T* out #ifndef RELU_NO_SLOPE -, T negative_slope +, KERNEL_ARG_DTYPE negative_slope #endif ) { int index = get_global_id(0); @@ -55,18 +63,19 @@ __kernel void ReLUForward(const int count, __global const T* in, __global T* out } __kernel void ReLU6Forward(const int count, __global const T* in, __global T* out, - const T minValue, const T maxValue) + const KERNEL_ARG_DTYPE minValue, const KERNEL_ARG_DTYPE maxValue) { int index = get_global_id(0); if(index < count) { T x = in[index]; - out[index] = clamp(x, minValue, maxValue); + out[index] = clamp(x, convert_T(minValue), convert_T(maxValue)); } } __kernel void PReLUForward(const int count, const int channels, const int plane_size, - __global const T* in, __global T* out, __global const T* slope_data) + __global const T* in, __global T* out, + __global const KERNEL_ARG_DTYPE* slope_data) { int index = get_global_id(0); int c = (index / plane_size) % channels; @@ -99,8 +108,22 @@ __kernel void AbsValForward(const int n, __global const T* in, __global T* out) out[index] = fabs(in[index]); } -__kernel void PowForward(const int n, __global const T* in, __global T* out, const T power, const T scale, const T shift) { +__kernel void PowForward(const int n, __global const T* in, __global T* out, + const KERNEL_ARG_DTYPE power, + const KERNEL_ARG_DTYPE scale, + const KERNEL_ARG_DTYPE shift) +{ int 
index = get_global_id(0); if (index < n) out[index] = pow(shift + scale * in[index], power); } + +__kernel void ELUForward(const int n, __global const T* in, __global T* out) +{ + int index = get_global_id(0); + if (index < n) + { + T src = in[index]; + out[index] = (src >= 0.f) ? src : exp(src) - 1; + } +} diff --git a/modules/dnn/src/opencl/batchnorm.cl b/modules/dnn/src/opencl/batchnorm.cl index e0072c9fea..bdd3d0a443 100644 --- a/modules/dnn/src/opencl/batchnorm.cl +++ b/modules/dnn/src/opencl/batchnorm.cl @@ -40,24 +40,27 @@ // //M*/ -#define Dtype float -#define Dtype4 float4 -#define Dtype8 float8 +#if defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif #if NUM == 8 #define load(src, index) vload8(0, src + index) #define store(vec, dst, index) vstore8(vec, 0, dst + index) - #define vec_type Dtype8 + #define float_type float8 + #define convert_f convert_float8 #define BATCH_NORM batch_norm8 #elif NUM == 4 #define load(src, index) vload4(0, src + index) #define store(vec, dst, index) vstore4(vec, 0, dst + index) - #define vec_type Dtype4 + #define float_type float4 + #define convert_f convert_float4 #define BATCH_NORM batch_norm4 #elif NUM == 1 #define load(src, index) src[index] #define store(vec, dst, index) dst[index] = vec - #define vec_type Dtype + #define float_type float + #define convert_f convert_float #define BATCH_NORM batch_norm1 #endif @@ -65,8 +68,8 @@ __kernel void BATCH_NORM(__global const Dtype* src, const int rows, const int cols, const int channels, - __global const Dtype* weight, - __global const Dtype* bias, + __global const float* weight, + __global const float* bias, __global Dtype* dst) { int x = get_global_id(0); @@ -76,9 +79,9 @@ __kernel void BATCH_NORM(__global const Dtype* src, if (x >= rows || y >= cols) return; - Dtype w = weight[x % channels]; - Dtype b = bias[x % channels]; - vec_type src_vec = load(src, index); - vec_type dst_vec = src_vec * w + (vec_type)b; - store(dst_vec, dst, index); + float w = 
weight[x % channels]; + float b = bias[x % channels]; + float_type src_vec = convert_f(load(src, index)); + float_type dst_vec = src_vec * w + (float_type)b; + store(convert_T(dst_vec), dst, index); } diff --git a/modules/dnn/src/opencl/concat.cl b/modules/dnn/src/opencl/concat.cl index 041e6ac740..69fb75248b 100644 --- a/modules/dnn/src/opencl/concat.cl +++ b/modules/dnn/src/opencl/concat.cl @@ -39,22 +39,29 @@ // //M*/ -__kernel void concat(const int nthreads, - __global const Dtype* in_data, - const int num_concats, - const int concat_size, - const int top_concat_axis, - const int bottom_concat_axis, - const int offset_concat_axis, - __global Dtype* out_data) { +#if defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif - for (int index = get_global_id(0); index < nthreads; - index += get_global_size(0)) { - const int total_concat_size = concat_size * bottom_concat_axis; - const int concat_num = index / total_concat_size; - const int concat_index = index % total_concat_size; - const int top_index = concat_index - + (concat_num * top_concat_axis + offset_concat_axis) * concat_size; - out_data[top_index] = in_data[index]; - } +#define CONCAT(A,B) A##_##B +#define TEMPLATE(name,type) CONCAT(name,type) + +__kernel void TEMPLATE(concat, Dtype)(const int nthreads, + __global const Dtype* in_data, + const int num_concats, + const int concat_size, + const int top_concat_axis, + const int bottom_concat_axis, + const int offset_concat_axis, + __global Dtype* out_data) +{ + for (int index = get_global_id(0); index < nthreads; index += get_global_size(0)) + { + const int total_concat_size = concat_size * bottom_concat_axis; + const int concat_num = index / total_concat_size; + const int concat_index = index % total_concat_size; + const int top_index = concat_index + + (concat_num * top_concat_axis + offset_concat_axis) * concat_size; + out_data[top_index] = in_data[index]; + } } diff --git a/modules/dnn/src/opencl/conv_layer_spatial.cl 
b/modules/dnn/src/opencl/conv_layer_spatial.cl index 5308bf1d1a..621ab6f620 100644 --- a/modules/dnn/src/opencl/conv_layer_spatial.cl +++ b/modules/dnn/src/opencl/conv_layer_spatial.cl @@ -40,27 +40,29 @@ // //M*/ -#if APPLY_BIAS -#define BIAS_KERNEL_ARG __global Dtype * biases_base, -#else -#define BIAS_KERNEL_ARG +#if defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable #endif +#define KERNEL_ARG_DTYPE float +#define TYPE_FLOAT 1 +#define TYPE_HALF 2 + #if defined(FUSED_CONV_RELU) -#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (Dtype)(negative_slope))) -#define FUSED_ARG Dtype negative_slope, +#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (negative_slope))) +#define FUSED_ARG KERNEL_ARG_DTYPE negative_slope, #elif defined(FUSED_CONV_PRELU) -#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (Dtype)(negative_slope[c]))) -#define FUSED_ARG __global const Dtype *negative_slope, +#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? 
(Dtype)(x) : ((Dtype)(x) * (negative_slope[c]))) +#define FUSED_ARG __global const KERNEL_ARG_DTYPE* negative_slope, #elif defined(FUSED_CONV_POWER) -#define ACTIVATION_RELU_FUNCTION(x, c) pow(x, power) -#define FUSED_ARG Dtype power, +#define ACTIVATION_RELU_FUNCTION(x, c) pow(x, (Dtype)power) +#define FUSED_ARG KERNEL_ARG_DTYPE power, #elif defined(FUSED_CONV_TANH) #define ACTIVATION_RELU_FUNCTION(x, c) tanh(x) #define FUSED_ARG #elif defined(FUSED_CONV_RELU6) -#define ACTIVATION_RELU_FUNCTION(x, c) (clamp((Dtype)(x), min_value, max_value)) -#define FUSED_ARG Dtype min_value, Dtype max_value, +#define ACTIVATION_RELU_FUNCTION(x, c) (clamp((Dtype)(x), (Dtype)min_value, (Dtype)max_value)) +#define FUSED_ARG KERNEL_ARG_DTYPE min_value, KERNEL_ARG_DTYPE max_value, #else #define ACTIVATION_RELU_FUNCTION(x, c) (x) #define FUSED_ARG @@ -74,6 +76,11 @@ #define ELTWISE_DATA_ARG #endif +#if APPLY_BIAS +#define BIAS_KERNEL_ARG __global Dtype * biases_base, +#else +#define BIAS_KERNEL_ARG +#endif #define __CAT(x, y) x##y #define CAT(x, y) __CAT(x, y) @@ -97,6 +104,16 @@ #define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT)) #if defined(convolve_simd) || defined(Conv_Interleaved) +#if TYPE == TYPE_HALF +#define INT_TYPE ushort +#define INT_TYPE2 ushort2 +#define INT_TYPE4 ushort4 +#define INT_TYPE8 ushort8 +#define SUB_GROUP_BLOCK_READ2 intel_sub_group_block_read_us2 +#define SUB_GROUP_BLOCK_READ4 intel_sub_group_block_read_us4 +#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read_us8 +#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read_us +#else #define INT_TYPE uint #define INT_TYPE2 uint2 #define INT_TYPE4 uint4 @@ -106,6 +123,7 @@ #define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read8 #define SUB_GROUP_BLOCK_READ intel_sub_group_block_read #endif +#endif #ifdef KERNEL_BASIC @@ -418,6 +436,25 @@ typedef struct float15 { float s0; float s1; float s2; float s3; float s4; float float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; float 
se; } float15; typedef struct float0 { float s0; } float0; //never used but makes compiler happy. +typedef struct half1 { half s0; } half1; +typedef struct half5 { half s0; half s1; half s2; half s3; half s4; } half5; +typedef struct half6 { half s0; half s1; half s2; half s3; half s4; half s5; } half6; +typedef struct half7 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; } half7; +typedef struct half9 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; half s8; } half9; +typedef struct half10 { half s0; half s1; half s2; half s3; half s4; half s5; + half s6; half s7; half s8; half s9; } half10; +typedef struct half11 { half s0; half s1; half s2; half s3; half s4; half s5; + half s6; half s7; half s8; half s9; half sa; } half11; +typedef struct half12 { half s0; half s1; half s2; half s3; half s4; half s5; + half s6; half s7; half s8; half s9; half sa; half sb; } half12; +typedef struct half13 { half s0; half s1; half s2; half s3; half s4; half s5; + half s6; half s7; half s8; half s9; half sa; half sb; half sc; } half13; +typedef struct half14 { half s0; half s1; half s2; half s3; half s4; half s5; + half s6; half s7; half s8; half s9; half sa; half sb; half sc; half sd; } half14; +typedef struct half15 { half s0; half s1; half s2; half s3; half s4; half s5; + half s6; half s7; half s8; half s9; half sa; half sb; half sc; half sd; half se; } half15; +typedef struct half0 { half s0; } half0; //never used but makes compiler happy. 
+ #define OUT_PITCH_X output_width #define ROW_PITCH input_width diff --git a/modules/dnn/src/opencl/eltwise.cl b/modules/dnn/src/opencl/eltwise.cl index 6f3a374f54..80d3305126 100644 --- a/modules/dnn/src/opencl/eltwise.cl +++ b/modules/dnn/src/opencl/eltwise.cl @@ -40,9 +40,9 @@ // //M*/ -#define Dtype float -#define Dtype4 float4 -#define Dtype8 float8 +#if defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif __kernel void op_sum4(__global const Dtype * A, __global const Dtype * B, @@ -73,20 +73,20 @@ __kernel void op_sum4(__global const Dtype * A, a2 = vload4(i, src0_read + 2 * A_col_size); a3 = vload4(i, src0_read + 3 * A_col_size); - dot0 = a0 * coeff1 + b0 * coeff2; - dot1 = a1 * coeff1 + b1 * coeff2; - dot2 = a2 * coeff1 + b2 * coeff2; - dot3 = a3 * coeff1 + b3 * coeff2; + dot0 = a0 * (Dtype4)coeff1 + b0 * (Dtype4)coeff2; + dot1 = a1 * (Dtype4)coeff1 + b1 * (Dtype4)coeff2; + dot2 = a2 * (Dtype4)coeff1 + b2 * (Dtype4)coeff2; + dot3 = a3 * (Dtype4)coeff1 + b3 * (Dtype4)coeff2; #else a0 = vload4(i, dst0_read); a1 = vload4(i, dst0_read + A_col_size); a2 = vload4(i, dst0_read + 2 * A_col_size); a3 = vload4(i, dst0_read + 3 * A_col_size); - dot0 = a0 + b0 * coeff2; - dot1 = a1 + b1 * coeff2; - dot2 = a2 + b2 * coeff2; - dot3 = a3 + b3 * coeff2; + dot0 = a0 + b0 * (Dtype4)coeff2; + dot1 = a1 + b1 * (Dtype4)coeff2; + dot2 = a2 + b2 * (Dtype4)coeff2; + dot3 = a3 + b3 * (Dtype4)coeff2; #endif vstore4(dot0, i, dst0_read); vstore4(dot1, i, dst0_read + A_col_size); diff --git a/modules/dnn/src/opencl/gemm_buffer.cl b/modules/dnn/src/opencl/gemm_buffer.cl new file mode 100644 index 0000000000..8cbc34dde5 --- /dev/null +++ b/modules/dnn/src/opencl/gemm_buffer.cl @@ -0,0 +1,1342 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. 
+// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#if defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif + +#define CONCAT(A,B) A##_##B +#define TEMPLATE(name,type) CONCAT(name,type) + +#define KERNEL_ARG_DTYPE float +#define TYPE_FLOAT 1 +#define TYPE_HALF 2 + +#if TYPE == TYPE_HALF +#define Dtype half +#define Dtype2 half2 +#define Dtype4 half4 +#define Dtype8 half8 +#define Dtype16 half16 + +#define as_Dtype as_half +#define as_Dtype2 as_half2 +#define as_Dtype4 as_half4 +#define as_Dtype8 as_half8 +#define as_Dtype16 as_half16 +#else +#define Dtype float +#define Dtype2 float2 +#define Dtype4 float4 +#define Dtype8 float8 +#define Dtype16 float16 + +#define as_Dtype as_float +#define as_Dtype2 as_float2 +#define as_Dtype4 as_float4 +#define as_Dtype8 as_float8 +#define as_Dtype16 as_float16 +#endif + +#if TYPE == TYPE_HALF +#define SHUFFLE_TYPE2(val) as_ushort2(val) +#define SHUFFLE_TYPE8(val) as_ushort8(val) +#define SIMD_SIZE_GEMM 16 +#else +#define SHUFFLE_TYPE2(val) val +#define SHUFFLE_TYPE8(val) val +#define SIMD_SIZE_GEMM 8 +#endif + +#if defined(cl_intel_subgroups) +#pragma OPENCL EXTENSION cl_intel_subgroups : enable +#endif + +#define VEC_SIZE 4 +#define LWG_HEIGHT 4 +#define TILE_M 8 +#if TYPE == TYPE_HALF +#define TILE_K 32 +#define TILE_N 64 +#else +#define TILE_K 16 +#define TILE_N 32 +#endif + +__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, LWG_HEIGHT, 1))) +__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) +__kernel void TEMPLATE(gemm_buffer_NN, Dtype)( + const __global Dtype *src0, int off0, + const __global Dtype *src1, int off1, + __global Dtype *dst, int offd, + int M, + int N, + int K, + KERNEL_ARG_DTYPE alpha_in, + KERNEL_ARG_DTYPE beta_in, + int start_index) +{ + const Dtype alpha = (Dtype)alpha_in; + const Dtype beta = (Dtype)beta_in; + const int group_x = get_group_id(0); + const int group_y = get_group_id(1); + const int local_x = get_local_id(0); + const int local_y = get_local_id(1); + const int global_x = get_global_id(0); + 
const int global_y = get_global_id(1); + + Dtype4 brow; + Dtype2 arow0, arow1, arow2, arow3, arow4, arow5, arow6, arow7; + + __global Dtype *dst_write0 = dst + local_x * VEC_SIZE + (group_x * TILE_N) + (group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M) * N + offd; + + const __global Dtype *src0_read = src0 + local_x * (TILE_K / SIMD_SIZE_GEMM) + (group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M) * K + start_index + off0; + + const __global Dtype *src1_read0 = src1 + local_x * VEC_SIZE + (group_x * TILE_N) + start_index * N + off1; + + int border = -(group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M); + + int row0 = mad24(global_y, TILE_M, 0) < M ? 0 : border; + int row1 = mad24(global_y, TILE_M, 1) < M ? 1 : border; + int row2 = mad24(global_y, TILE_M, 2) < M ? 2 : border; + int row3 = mad24(global_y, TILE_M, 3) < M ? 3 : border; + int row4 = mad24(global_y, TILE_M, 4) < M ? 4 : border; + int row5 = mad24(global_y, TILE_M, 5) < M ? 5 : border; + int row6 = mad24(global_y, TILE_M, 6) < M ? 6 : border; + int row7 = mad24(global_y, TILE_M, 7) < M ? 7 : border; + + Dtype4 dot00 = (start_index != 0) ? vload4(0, dst_write0) : beta * vload4(0, dst_write0); + Dtype4 dot01 = (start_index != 0) ? vload4(0, dst_write0 + 1 * N) : beta * vload4(0, dst_write0 + 1 * N); + Dtype4 dot02 = (start_index != 0) ? vload4(0, dst_write0 + 2 * N) : beta * vload4(0, dst_write0 + 2 * N); + Dtype4 dot03 = (start_index != 0) ? vload4(0, dst_write0 + 3 * N) : beta * vload4(0, dst_write0 + 3 * N); + Dtype4 dot04 = (start_index != 0) ? vload4(0, dst_write0 + 4 * N) : beta * vload4(0, dst_write0 + 4 * N); + Dtype4 dot05 = (start_index != 0) ? vload4(0, dst_write0 + 5 * N) : beta * vload4(0, dst_write0 + 5 * N); + Dtype4 dot06 = (start_index != 0) ? vload4(0, dst_write0 + 6 * N) : beta * vload4(0, dst_write0 + 6 * N); + Dtype4 dot07 = (start_index != 0) ? 
vload4(0, dst_write0 + 7 * N) : beta * vload4(0, dst_write0 + 7 * N); + + int end_index = min(start_index + 256, K); + int w = start_index; + while( w + TILE_K <= end_index ) { + arow0 = alpha * vload2(0, src0_read + row0 * K); + arow1 = alpha * vload2(0, src0_read + row1 * K); + arow2 = alpha * vload2(0, src0_read + row2 * K); + arow3 = alpha * vload2(0, src0_read + row3 * K); + arow4 = alpha * vload2(0, src0_read + row4 * K); + arow5 = alpha * vload2(0, src0_read + row5 * K); + arow6 = alpha * vload2(0, src0_read + row6 * K); + arow7 = alpha * vload2(0, src0_read + row7 * K); + +#define MM_DOT_PRODUCT( index, suffix ) \ + brow = vload4(0, src1_read0); src1_read0 += N; \ + dot00 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow0), index )).s##suffix), brow, dot00 ); \ + dot01 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow1), index )).s##suffix), brow, dot01 ); \ + dot02 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow2), index )).s##suffix), brow, dot02 ); \ + dot03 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow3), index )).s##suffix), brow, dot03 ); \ + dot04 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow4), index )).s##suffix), brow, dot04 ); \ + dot05 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow5), index )).s##suffix), brow, dot05 ); \ + dot06 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow6), index )).s##suffix), brow, dot06 ); \ + dot07 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow7), index )).s##suffix), brow, dot07 ); + + MM_DOT_PRODUCT(0, 0); + MM_DOT_PRODUCT(0, 1); + MM_DOT_PRODUCT(1, 0); + MM_DOT_PRODUCT(1, 1); + MM_DOT_PRODUCT(2, 0); + MM_DOT_PRODUCT(2, 1); + MM_DOT_PRODUCT(3, 0); + MM_DOT_PRODUCT(3, 1); + MM_DOT_PRODUCT(4, 0); + MM_DOT_PRODUCT(4, 1); + MM_DOT_PRODUCT(5, 0); + MM_DOT_PRODUCT(5, 1); + MM_DOT_PRODUCT(6, 0); + MM_DOT_PRODUCT(6, 1); + MM_DOT_PRODUCT(7, 0); + 
MM_DOT_PRODUCT(7, 1); +#if TYPE == TYPE_HALF + MM_DOT_PRODUCT(8, 0); + MM_DOT_PRODUCT(8, 1); + MM_DOT_PRODUCT(9, 0); + MM_DOT_PRODUCT(9, 1); + MM_DOT_PRODUCT(10, 0); + MM_DOT_PRODUCT(10, 1); + MM_DOT_PRODUCT(11, 0); + MM_DOT_PRODUCT(11, 1); + MM_DOT_PRODUCT(12, 0); + MM_DOT_PRODUCT(12, 1); + MM_DOT_PRODUCT(13, 0); + MM_DOT_PRODUCT(13, 1); + MM_DOT_PRODUCT(14, 0); + MM_DOT_PRODUCT(14, 1); + MM_DOT_PRODUCT(15, 0); + MM_DOT_PRODUCT(15, 1); +#endif +#undef MM_DOT_PRODUCT + + src0_read += TILE_K; + w += TILE_K; + } + + if(w < end_index) { + arow0.x = ((w + local_x * 2) < K) ? alpha * (src0_read + row0 * K)[0] : 0.0f; + arow0.y = ((w + local_x * 2 + 1) < K) ? alpha * (src0_read + row0 * K)[1] : 0.0f; + arow1.x = ((w + local_x * 2) < K) ? alpha * (src0_read + row1 * K)[0] : 0.0f; + arow1.y = ((w + local_x * 2 + 1) < K) ? alpha * (src0_read + row1 * K)[1] : 0.0f; + arow2.x = ((w + local_x * 2) < K) ? alpha * (src0_read + row2 * K)[0] : 0.0f; + arow2.y = ((w + local_x * 2 + 1) < K) ? alpha * (src0_read + row2 * K)[1] : 0.0f; + arow3.x = ((w + local_x * 2) < K) ? alpha * (src0_read + row3 * K)[0] : 0.0f; + arow3.y = ((w + local_x * 2 + 1) < K) ? alpha * (src0_read + row3 * K)[1] : 0.0f; + arow4.x = ((w + local_x * 2) < K) ? alpha * (src0_read + row4 * K)[0] : 0.0f; + arow4.y = ((w + local_x * 2 + 1) < K) ? alpha * (src0_read + row4 * K)[1] : 0.0f; + arow5.x = ((w + local_x * 2) < K) ? alpha * (src0_read + row5 * K)[0] : 0.0f; + arow5.y = ((w + local_x * 2 + 1) < K) ? alpha * (src0_read + row5 * K)[1] : 0.0f; + arow6.x = ((w + local_x * 2) < K) ? alpha * (src0_read + row6 * K)[0] : 0.0f; + arow6.y = ((w + local_x * 2 + 1) < K) ? alpha * (src0_read + row6 * K)[1] : 0.0f; + arow7.x = ((w + local_x * 2) < K) ? alpha * (src0_read + row7 * K)[0] : 0.0f; + arow7.y = ((w + local_x * 2 + 1) < K) ? alpha * (src0_read + row7 * K)[1] : 0.0f; + +#define MM_DOT_PRODUCT( index, suffix ) \ + brow = (w < K) ? 
vload4(0, src1_read0) : (Dtype4)0.0f; src1_read0 += N; w++; \ + dot00 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow0), index )).s##suffix), brow, dot00 ); \ + dot01 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow1), index )).s##suffix), brow, dot01 ); \ + dot02 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow2), index )).s##suffix), brow, dot02 ); \ + dot03 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow3), index )).s##suffix), brow, dot03 ); \ + dot04 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow4), index )).s##suffix), brow, dot04 ); \ + dot05 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow5), index )).s##suffix), brow, dot05 ); \ + dot06 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow6), index )).s##suffix), brow, dot06 ); \ + dot07 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow7), index )).s##suffix), brow, dot07 ); + + MM_DOT_PRODUCT(0, 0); + MM_DOT_PRODUCT(0, 1); + MM_DOT_PRODUCT(1, 0); + MM_DOT_PRODUCT(1, 1); + MM_DOT_PRODUCT(2, 0); + MM_DOT_PRODUCT(2, 1); + MM_DOT_PRODUCT(3, 0); + MM_DOT_PRODUCT(3, 1); + MM_DOT_PRODUCT(4, 0); + MM_DOT_PRODUCT(4, 1); + MM_DOT_PRODUCT(5, 0); + MM_DOT_PRODUCT(5, 1); + MM_DOT_PRODUCT(6, 0); + MM_DOT_PRODUCT(6, 1); + MM_DOT_PRODUCT(7, 0); + MM_DOT_PRODUCT(7, 1); +#if TYPE == TYPE_HALF + MM_DOT_PRODUCT(8, 0); + MM_DOT_PRODUCT(8, 1); + MM_DOT_PRODUCT(9, 0); + MM_DOT_PRODUCT(9, 1); + MM_DOT_PRODUCT(10, 0); + MM_DOT_PRODUCT(10, 1); + MM_DOT_PRODUCT(11, 0); + MM_DOT_PRODUCT(11, 1); + MM_DOT_PRODUCT(12, 0); + MM_DOT_PRODUCT(12, 1); + MM_DOT_PRODUCT(13, 0); + MM_DOT_PRODUCT(13, 1); + MM_DOT_PRODUCT(14, 0); + MM_DOT_PRODUCT(14, 1); + MM_DOT_PRODUCT(15, 0); + MM_DOT_PRODUCT(15, 1); +#endif +#undef MM_DOT_PRODUCT + } + + if(global_x * 4 < N && global_y * 8 < M) { + if(mad24(global_x, 4, 3) < N) { + vstore4(dot00, 0, dst_write0); dst_write0 += N; + 
if(mad24(global_y, 8, 1) < M) { vstore4(dot01, 0, dst_write0); dst_write0 += N; } + else return; + if(mad24(global_y, 8, 2) < M) { vstore4(dot02, 0, dst_write0); dst_write0 += N; } + else return; + if(mad24(global_y, 8, 3) < M) { vstore4(dot03, 0, dst_write0); dst_write0 += N; } + else return; + if(mad24(global_y, 8, 4) < M) { vstore4(dot04, 0, dst_write0); dst_write0 += N; } + else return; + if(mad24(global_y, 8, 5) < M) { vstore4(dot05, 0, dst_write0); dst_write0 += N; } + else return; + if(mad24(global_y, 8, 6) < M) { vstore4(dot06, 0, dst_write0); dst_write0 += N; } + else return; + if(mad24(global_y, 8, 7) < M) { vstore4(dot07, 0, dst_write0); } + } else if(mad24(global_x, 4, 2) < N) { + vstore2(dot00.xy, 0, dst_write0); + dst_write0[2] = dot00.z; + dst_write0 += N; + if(mad24(global_y, 8, 1) < M) { + vstore2(dot01.xy, 0, dst_write0); + dst_write0[2] = dot01.z; + dst_write0 += N; + } else + return; + if(mad24(global_y, 8, 2) < M) { + vstore2(dot02.xy, 0, dst_write0); + dst_write0[2] = dot02.z; + dst_write0 += N; + } else + return; + if(mad24(global_y, 8, 3) < M) { + vstore2(dot03.xy, 0, dst_write0); + dst_write0[2] = dot03.z; + dst_write0 += N; + } else + return; + if(mad24(global_y, 8, 4) < M) { + vstore2(dot04.xy, 0, dst_write0); + dst_write0[2] = dot04.z; + dst_write0 += N; + } else + return; + if(mad24(global_y, 8, 5) < M) { + vstore2(dot05.xy, 0, dst_write0); + dst_write0[2] = dot05.z; + dst_write0 += N; + } else + return; + if(mad24(global_y, 8, 6) < M) { + vstore2(dot06.xy, 0, dst_write0); + dst_write0[2] = dot06.z; + dst_write0 += N; + } else + return; + if(mad24(global_y, 8, 7) < M) { + vstore2(dot07.xy, 0, dst_write0); + dst_write0[2] = dot07.z; + } + } else if(mad24(global_x, 4, 1) < N) { + vstore2(dot00.xy, 0, dst_write0); dst_write0 += N; + if(mad24(global_y, 8, 1) < M) { vstore2(dot01.xy, 0, dst_write0); dst_write0 += N; } + else return; + if(mad24(global_y, 8, 2) < M) { vstore2(dot02.xy, 0, dst_write0); dst_write0 += N; } + else return; + 
if(mad24(global_y, 8, 3) < M) { vstore2(dot03.xy, 0, dst_write0); dst_write0 += N; } + else return; + if(mad24(global_y, 8, 4) < M) { vstore2(dot04.xy, 0, dst_write0); dst_write0 += N; } + else return; + if(mad24(global_y, 8, 5) < M) { vstore2(dot05.xy, 0, dst_write0); dst_write0 += N; } + else return; + if(mad24(global_y, 8, 6) < M) { vstore2(dot06.xy, 0, dst_write0); dst_write0 += N; } + else return; + if(mad24(global_y, 8, 7) < M) { vstore2(dot07.xy, 0, dst_write0); } + } else { + dst_write0[0] = dot00.x; dst_write0 += N; + if(mad24(global_y, 8, 1) < M) { dst_write0[0] = dot01.x; dst_write0 += N; } + else return; + if(mad24(global_y, 8, 2) < M) { dst_write0[0] = dot02.x; dst_write0 += N; } + else return; + if(mad24(global_y, 8, 3) < M) { dst_write0[0] = dot03.x; dst_write0 += N; } + else return; + if(mad24(global_y, 8, 4) < M) { dst_write0[0] = dot04.x; dst_write0 += N; } + else return; + if(mad24(global_y, 8, 5) < M) { dst_write0[0] = dot05.x; dst_write0 += N; } + else return; + if(mad24(global_y, 8, 6) < M) { dst_write0[0] = dot06.x; dst_write0 += N; } + else return; + if(mad24(global_y, 8, 7) < M) { dst_write0[0] = dot07.x; } + } + } +} + +#undef VEC_SIZE +#undef LWG_HEIGHT +#undef TILE_M +#undef TILE_K +#undef TILE_N + +#define VEC_SIZE 1 +#define TILE_M 8 +#define TILE_N 8 +#define SLM_BLOCK 128 + +#if TYPE == TYPE_HALF +#define LWG_HEIGHT 2 +#define TILE_K 64 +#else +#define LWG_HEIGHT 4 +#define TILE_K 32 +#endif + +#if TYPE == TYPE_HALF +__attribute__((reqd_work_group_size(8, LWG_HEIGHT, 1))) +__attribute__((intel_reqd_sub_group_size(8))) +__kernel void TEMPLATE(gemm_buffer_NT, Dtype)( + const __global Dtype *src0, int off0, + const __global Dtype *src1, int off1, + __global Dtype *dst, int offd, + int M, + int N, + int K, + KERNEL_ARG_DTYPE alpha_in, + KERNEL_ARG_DTYPE beta_in) +{ + const Dtype alpha = (Dtype)alpha_in; + const Dtype beta = (Dtype)beta_in; + const int group_x = get_group_id(0); + const int group_y = get_group_id(1); + const int local_x = 
get_local_id(0); + const int local_y = get_local_id(1); + const int global_x = get_global_id(0); + const int global_y = get_global_id(1); + + Dtype8 dot00 = 0.f; + Dtype8 dot01 = 0.f; + Dtype8 dot02 = 0.f; + Dtype8 dot03 = 0.f; + Dtype8 dot04 = 0.f; + Dtype8 dot05 = 0.f; + Dtype8 dot06 = 0.f; + Dtype8 dot07 = 0.f; + + Dtype8 brow0; + Dtype8 brow1; + Dtype8 brow2; + Dtype8 brow3; + Dtype8 brow4; + Dtype8 brow5; + Dtype8 brow6; + Dtype8 brow7; + + __global Dtype *dst_write0 = dst + local_x * VEC_SIZE + (group_x * TILE_N) + (group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M) * N + offd; + + const __global Dtype *src0_read = src0 + local_x * (TILE_K / 8) + (group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M) * K + off0; + + const __global Dtype *src1_read0 = src1 + (group_x * TILE_N) * K + off1; + + __local Dtype slm_brow[8 * SLM_BLOCK]; + __local Dtype* slm_brow0; + + int local_index = mad24(local_y, 8, local_x) * 8; + int w; + for(int b_tile = 0; b_tile < K; b_tile += SLM_BLOCK) { + barrier(CLK_LOCAL_MEM_FENCE); + vstore4(vload4(0, (__global float *)(src1_read0 + mad24(0, K, local_index))), 0, (__local float *)(slm_brow + mad24(0, SLM_BLOCK, local_index))); + vstore4(vload4(0, (__global float *)(src1_read0 + mad24(1, K, local_index))), 0, (__local float *)(slm_brow + mad24(1, SLM_BLOCK, local_index))); + vstore4(vload4(0, (__global float *)(src1_read0 + mad24(2, K, local_index))), 0, (__local float *)(slm_brow + mad24(2, SLM_BLOCK, local_index))); + vstore4(vload4(0, (__global float *)(src1_read0 + mad24(3, K, local_index))), 0, (__local float *)(slm_brow + mad24(3, SLM_BLOCK, local_index))); + vstore4(vload4(0, (__global float *)(src1_read0 + mad24(4, K, local_index))), 0, (__local float *)(slm_brow + mad24(4, SLM_BLOCK, local_index))); + vstore4(vload4(0, (__global float *)(src1_read0 + mad24(5, K, local_index))), 0, (__local float *)(slm_brow + mad24(5, SLM_BLOCK, local_index))); + vstore4(vload4(0, (__global float *)(src1_read0 + mad24(6, K, local_index))), 0, 
(__local float *)(slm_brow + mad24(6, SLM_BLOCK, local_index))); + vstore4(vload4(0, (__global float *)(src1_read0 + mad24(7, K, local_index))), 0, (__local float *)(slm_brow + mad24(7, SLM_BLOCK, local_index))); + barrier(CLK_LOCAL_MEM_FENCE); + + slm_brow0 = slm_brow + local_x * (TILE_K / 8); + w = b_tile; + int end_w = min(b_tile + SLM_BLOCK, K); + while( w + TILE_K <= end_w ) { + Dtype8 arow; + + brow0 = as_half8(vload4(0, (__local float *)(slm_brow0 + 0 * SLM_BLOCK))); + brow1 = as_half8(vload4(0, (__local float *)(slm_brow0 + 1 * SLM_BLOCK))); + brow2 = as_half8(vload4(0, (__local float *)(slm_brow0 + 2 * SLM_BLOCK))); + brow3 = as_half8(vload4(0, (__local float *)(slm_brow0 + 3 * SLM_BLOCK))); + brow4 = as_half8(vload4(0, (__local float *)(slm_brow0 + 4 * SLM_BLOCK))); + brow5 = as_half8(vload4(0, (__local float *)(slm_brow0 + 5 * SLM_BLOCK))); + brow6 = as_half8(vload4(0, (__local float *)(slm_brow0 + 6 * SLM_BLOCK))); + brow7 = as_half8(vload4(0, (__local float *)(slm_brow0 + 7 * SLM_BLOCK))); + +#define MM_DOT_PRODUCT( _row, _dot ) \ + arow = as_half8(vload4(0, (__global float *)(src0_read + _row * K))); \ + _dot = mad( (Dtype8)(arow.s0), (Dtype8)(brow0.s0, brow1.s0, brow2.s0, brow3.s0, brow4.s0, brow5.s0, brow6.s0, brow7.s0), _dot ); \ + _dot = mad( (Dtype8)(arow.s1), (Dtype8)(brow0.s1, brow1.s1, brow2.s1, brow3.s1, brow4.s1, brow5.s1, brow6.s1, brow7.s1), _dot ); \ + _dot = mad( (Dtype8)(arow.s2), (Dtype8)(brow0.s2, brow1.s2, brow2.s2, brow3.s2, brow4.s2, brow5.s2, brow6.s2, brow7.s2), _dot ); \ + _dot = mad( (Dtype8)(arow.s3), (Dtype8)(brow0.s3, brow1.s3, brow2.s3, brow3.s3, brow4.s3, brow5.s3, brow6.s3, brow7.s3), _dot ); \ + _dot = mad( (Dtype8)(arow.s4), (Dtype8)(brow0.s4, brow1.s4, brow2.s4, brow3.s4, brow4.s4, brow5.s4, brow6.s4, brow7.s4), _dot ); \ + _dot = mad( (Dtype8)(arow.s5), (Dtype8)(brow0.s5, brow1.s5, brow2.s5, brow3.s5, brow4.s5, brow5.s5, brow6.s5, brow7.s5), _dot ); \ + _dot = mad( (Dtype8)(arow.s6), (Dtype8)(brow0.s6, brow1.s6, 
brow2.s6, brow3.s6, brow4.s6, brow5.s6, brow6.s6, brow7.s6), _dot ); \ + _dot = mad( (Dtype8)(arow.s7), (Dtype8)(brow0.s7, brow1.s7, brow2.s7, brow3.s7, brow4.s7, brow5.s7, brow6.s7, brow7.s7), _dot ); + + MM_DOT_PRODUCT( 0, dot00 ); + MM_DOT_PRODUCT( 1, dot01 ); + MM_DOT_PRODUCT( 2, dot02 ); + MM_DOT_PRODUCT( 3, dot03 ); + MM_DOT_PRODUCT( 4, dot04 ); + MM_DOT_PRODUCT( 5, dot05 ); + MM_DOT_PRODUCT( 6, dot06 ); + MM_DOT_PRODUCT( 7, dot07 ); +#undef MM_DOT_PRODUCT + + src0_read += TILE_K; + slm_brow0 += TILE_K; + w += TILE_K; + } + src1_read0 += SLM_BLOCK; + } + + if(w < K) { + Dtype8 arow; + +#define READ_BROW(_brow, _row) \ + _brow = as_half8(vload4(0, (__local float *)(slm_brow0 + _row * SLM_BLOCK))); \ + _brow.s0 = (mad24(local_x, 8, w) < K) ? _brow.s0 : 0.0f; \ + _brow.s1 = (mad24(local_x, 8, w + 1) < K) ? _brow.s1 : 0.0f; \ + _brow.s2 = (mad24(local_x, 8, w + 2) < K) ? _brow.s2 : 0.0f; \ + _brow.s3 = (mad24(local_x, 8, w + 3) < K) ? _brow.s3 : 0.0f; \ + _brow.s4 = (mad24(local_x, 8, w + 4) < K) ? _brow.s4 : 0.0f; \ + _brow.s5 = (mad24(local_x, 8, w + 5) < K) ? _brow.s5 : 0.0f; \ + _brow.s6 = (mad24(local_x, 8, w + 6) < K) ? _brow.s6 : 0.0f; \ + _brow.s7 = (mad24(local_x, 8, w + 7) < K) ? _brow.s7 : 0.0f; + + READ_BROW(brow0, 0); + READ_BROW(brow1, 1); + READ_BROW(brow2, 2); + READ_BROW(brow3, 3); + READ_BROW(brow4, 4); + READ_BROW(brow5, 5); + READ_BROW(brow6, 6); + READ_BROW(brow7, 7); + +#undef READ_BROW + +#define MM_DOT_PRODUCT( _row, _dot ) \ + arow = as_half8(vload4(0, (__global float *)(src0_read + _row * K))); \ + arow.s0 = (mad24(local_x, 8, w) < K) ? arow.s0 : 0.0f; \ + arow.s1 = (mad24(local_x, 8, w + 1) < K) ? arow.s1 : 0.0f; \ + arow.s2 = (mad24(local_x, 8, w + 2) < K) ? arow.s2 : 0.0f; \ + arow.s3 = (mad24(local_x, 8, w + 3) < K) ? arow.s3 : 0.0f; \ + arow.s4 = (mad24(local_x, 8, w + 4) < K) ? arow.s4 : 0.0f; \ + arow.s5 = (mad24(local_x, 8, w + 5) < K) ? arow.s5 : 0.0f; \ + arow.s6 = (mad24(local_x, 8, w + 6) < K) ? 
arow.s6 : 0.0f; \ + arow.s7 = (mad24(local_x, 8, w + 7) < K) ? arow.s7 : 0.0f; \ + _dot = mad( (Dtype8)(arow.s0), (Dtype8)(brow0.s0, brow1.s0, brow2.s0, brow3.s0, brow4.s0, brow5.s0, brow6.s0, brow7.s0), _dot ); \ + _dot = mad( (Dtype8)(arow.s1), (Dtype8)(brow0.s1, brow1.s1, brow2.s1, brow3.s1, brow4.s1, brow5.s1, brow6.s1, brow7.s1), _dot ); \ + _dot = mad( (Dtype8)(arow.s2), (Dtype8)(brow0.s2, brow1.s2, brow2.s2, brow3.s2, brow4.s2, brow5.s2, brow6.s2, brow7.s2), _dot ); \ + _dot = mad( (Dtype8)(arow.s3), (Dtype8)(brow0.s3, brow1.s3, brow2.s3, brow3.s3, brow4.s3, brow5.s3, brow6.s3, brow7.s3), _dot ); \ + _dot = mad( (Dtype8)(arow.s4), (Dtype8)(brow0.s4, brow1.s4, brow2.s4, brow3.s4, brow4.s4, brow5.s4, brow6.s4, brow7.s4), _dot ); \ + _dot = mad( (Dtype8)(arow.s5), (Dtype8)(brow0.s5, brow1.s5, brow2.s5, brow3.s5, brow4.s5, brow5.s5, brow6.s5, brow7.s5), _dot ); \ + _dot = mad( (Dtype8)(arow.s6), (Dtype8)(brow0.s6, brow1.s6, brow2.s6, brow3.s6, brow4.s6, brow5.s6, brow6.s6, brow7.s6), _dot ); \ + _dot = mad( (Dtype8)(arow.s7), (Dtype8)(brow0.s7, brow1.s7, brow2.s7, brow3.s7, brow4.s7, brow5.s7, brow6.s7, brow7.s7), _dot ); + + MM_DOT_PRODUCT( 0, dot00 ); + MM_DOT_PRODUCT( 1, dot01 ); + MM_DOT_PRODUCT( 2, dot02 ); + MM_DOT_PRODUCT( 3, dot03 ); + MM_DOT_PRODUCT( 4, dot04 ); + MM_DOT_PRODUCT( 5, dot05 ); + MM_DOT_PRODUCT( 6, dot06 ); + MM_DOT_PRODUCT( 7, dot07 ); +#undef MM_DOT_PRODUCT + } + +#define REDUCE(_dot) \ + _dot = as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 0)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 1)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 2)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 3)) + \ + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 4)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 5)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 6)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 7)); + + REDUCE(dot00); + REDUCE(dot01); + REDUCE(dot02); + 
REDUCE(dot03); + REDUCE(dot04); + REDUCE(dot05); + REDUCE(dot06); + REDUCE(dot07); +#undef REDUCE + + Dtype output = 0.0f; +#define OUTPUT( _dot) \ + output = (local_x == 0) ? _dot.s0 : output; \ + output = (local_x == 1) ? _dot.s1 : output; \ + output = (local_x == 2) ? _dot.s2 : output; \ + output = (local_x == 3) ? _dot.s3 : output; \ + output = (local_x == 4) ? _dot.s4 : output; \ + output = (local_x == 5) ? _dot.s5 : output; \ + output = (local_x == 6) ? _dot.s6 : output; \ + output = (local_x == 7) ? _dot.s7 : output; \ + dst_write0[0] = mad(output, alpha, beta * dst_write0[0]); \ + dst_write0 += N; + + if(global_x < N && global_y * 8 < M) { + OUTPUT(dot00); + if(mad24(global_y, 8, 1) < M) { OUTPUT(dot01); } + if(mad24(global_y, 8, 2) < M) { OUTPUT(dot02); } + if(mad24(global_y, 8, 3) < M) { OUTPUT(dot03); } + if(mad24(global_y, 8, 4) < M) { OUTPUT(dot04); } + if(mad24(global_y, 8, 5) < M) { OUTPUT(dot05); } + if(mad24(global_y, 8, 6) < M) { OUTPUT(dot06); } + if(mad24(global_y, 8, 7) < M) { OUTPUT(dot07); } + } +#undef OUTPUT +} + +#else + +__attribute__((reqd_work_group_size(8, LWG_HEIGHT, 1))) +__attribute__((intel_reqd_sub_group_size(8))) +__kernel void TEMPLATE(gemm_buffer_NT, Dtype)( + const __global Dtype *src0, int off0, + const __global Dtype *src1, int off1, + __global Dtype *dst, int offd, + int M, + int N, + int K, + KERNEL_ARG_DTYPE alpha_in, + KERNEL_ARG_DTYPE beta_in) +{ + const Dtype alpha = (Dtype)alpha_in; + const Dtype beta = (Dtype)beta_in; + const int group_x = get_group_id(0); + const int group_y = get_group_id(1); + const int local_x = get_local_id(0); + const int local_y = get_local_id(1); + const int global_x = get_global_id(0); + const int global_y = get_global_id(1); + + Dtype8 dot00 = 0.f; + Dtype8 dot01 = 0.f; + Dtype8 dot02 = 0.f; + Dtype8 dot03 = 0.f; + Dtype8 dot04 = 0.f; + Dtype8 dot05 = 0.f; + Dtype8 dot06 = 0.f; + Dtype8 dot07 = 0.f; + + Dtype4 brow0; + Dtype4 brow1; + Dtype4 brow2; + Dtype4 brow3; + Dtype4 brow4; + Dtype4 
brow5; + Dtype4 brow6; + Dtype4 brow7; + + __global Dtype *dst_write0 = dst + local_x * VEC_SIZE + (group_x * TILE_N) + (group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M) * N + offd; + + const __global Dtype *src0_read = src0 + local_x * (TILE_K / 8) + (group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M) * K + off0; + + const __global Dtype *src1_read0 = src1 + (group_x * TILE_N) * K + off1; + + __local Dtype slm_brow[8 * SLM_BLOCK]; + __local Dtype* slm_brow0; + + int local_index = mad24(local_y, 8, local_x) * 4; + int w; + for(int b_tile = 0; b_tile < K; b_tile += SLM_BLOCK) { + barrier(CLK_LOCAL_MEM_FENCE); + vstore4(vload4(0, src1_read0 + mad24(0, K, local_index)), 0, slm_brow + mad24(0, SLM_BLOCK, local_index)); + vstore4(vload4(0, src1_read0 + mad24(1, K, local_index)), 0, slm_brow + mad24(1, SLM_BLOCK, local_index)); + vstore4(vload4(0, src1_read0 + mad24(2, K, local_index)), 0, slm_brow + mad24(2, SLM_BLOCK, local_index)); + vstore4(vload4(0, src1_read0 + mad24(3, K, local_index)), 0, slm_brow + mad24(3, SLM_BLOCK, local_index)); + vstore4(vload4(0, src1_read0 + mad24(4, K, local_index)), 0, slm_brow + mad24(4, SLM_BLOCK, local_index)); + vstore4(vload4(0, src1_read0 + mad24(5, K, local_index)), 0, slm_brow + mad24(5, SLM_BLOCK, local_index)); + vstore4(vload4(0, src1_read0 + mad24(6, K, local_index)), 0, slm_brow + mad24(6, SLM_BLOCK, local_index)); + vstore4(vload4(0, src1_read0 + mad24(7, K, local_index)), 0, slm_brow + mad24(7, SLM_BLOCK, local_index)); + barrier(CLK_LOCAL_MEM_FENCE); + + slm_brow0 = slm_brow + local_x * (TILE_K / 8); + w = b_tile; + int end_w = min(b_tile + SLM_BLOCK, K); + while( w + TILE_K <= end_w ) { + Dtype4 arow; + + brow0 = vload4(0, slm_brow0 + 0 * SLM_BLOCK); + brow1 = vload4(0, slm_brow0 + 1 * SLM_BLOCK); + brow2 = vload4(0, slm_brow0 + 2 * SLM_BLOCK); + brow3 = vload4(0, slm_brow0 + 3 * SLM_BLOCK); + brow4 = vload4(0, slm_brow0 + 4 * SLM_BLOCK); + brow5 = vload4(0, slm_brow0 + 5 * SLM_BLOCK); + brow6 = vload4(0, slm_brow0 + 6 
* SLM_BLOCK); + brow7 = vload4(0, slm_brow0 + 7 * SLM_BLOCK); + +#define MM_DOT_PRODUCT( _row, _dot ) \ + arow = vload4(0, src0_read + _row * K); \ + _dot = mad( (Dtype8)(arow.x), (Dtype8)(brow0.x, brow1.x, brow2.x, brow3.x, brow4.x, brow5.x, brow6.x, brow7.x), _dot ); \ + _dot = mad( (Dtype8)(arow.y), (Dtype8)(brow0.y, brow1.y, brow2.y, brow3.y, brow4.y, brow5.y, brow6.y, brow7.y), _dot ); \ + _dot = mad( (Dtype8)(arow.z), (Dtype8)(brow0.z, brow1.z, brow2.z, brow3.z, brow4.z, brow5.z, brow6.z, brow7.z), _dot ); \ + _dot = mad( (Dtype8)(arow.w), (Dtype8)(brow0.w, brow1.w, brow2.w, brow3.w, brow4.w, brow5.w, brow6.w, brow7.w), _dot ); + + MM_DOT_PRODUCT( 0, dot00 ); + MM_DOT_PRODUCT( 1, dot01 ); + MM_DOT_PRODUCT( 2, dot02 ); + MM_DOT_PRODUCT( 3, dot03 ); + MM_DOT_PRODUCT( 4, dot04 ); + MM_DOT_PRODUCT( 5, dot05 ); + MM_DOT_PRODUCT( 6, dot06 ); + MM_DOT_PRODUCT( 7, dot07 ); +#undef MM_DOT_PRODUCT + + src0_read += TILE_K; + slm_brow0 += TILE_K; + w += TILE_K; + } + src1_read0 += SLM_BLOCK; + } + + if(w < K) { + Dtype4 arow; + +#define READ_BROW(_brow, _row) \ + _brow = vload4(0, slm_brow0 + _row * SLM_BLOCK); \ + _brow.x = (mad24(local_x, 4, w) < K) ? _brow.x : 0.0f; \ + _brow.y = (mad24(local_x, 4, w + 1) < K) ? _brow.y : 0.0f; \ + _brow.z = (mad24(local_x, 4, w + 2) < K) ? _brow.z : 0.0f; \ + _brow.w = (mad24(local_x, 4, w + 3) < K) ? _brow.w : 0.0f; + + READ_BROW(brow0, 0); + READ_BROW(brow1, 1); + READ_BROW(brow2, 2); + READ_BROW(brow3, 3); + READ_BROW(brow4, 4); + READ_BROW(brow5, 5); + READ_BROW(brow6, 6); + READ_BROW(brow7, 7); + +#undef READ_BROW + +#define MM_DOT_PRODUCT( _row, _dot ) \ + arow = vload4(0, src0_read + _row * K); \ + arow.x = (mad24(local_x, 4, w) < K) ? arow.x : 0.0f; \ + arow.y = (mad24(local_x, 4, w + 1) < K) ? arow.y : 0.0f; \ + arow.z = (mad24(local_x, 4, w + 2) < K) ? arow.z : 0.0f; \ + arow.w = (mad24(local_x, 4, w + 3) < K) ? 
arow.w : 0.0f; \ + _dot = mad( (Dtype8)(arow.x), (Dtype8)(brow0.x, brow1.x, brow2.x, brow3.x, brow4.x, brow5.x, brow6.x, brow7.x), _dot ); \ + _dot = mad( (Dtype8)(arow.y), (Dtype8)(brow0.y, brow1.y, brow2.y, brow3.y, brow4.y, brow5.y, brow6.y, brow7.y), _dot ); \ + _dot = mad( (Dtype8)(arow.z), (Dtype8)(brow0.z, brow1.z, brow2.z, brow3.z, brow4.z, brow5.z, brow6.z, brow7.z), _dot ); \ + _dot = mad( (Dtype8)(arow.w), (Dtype8)(brow0.w, brow1.w, brow2.w, brow3.w, brow4.w, brow5.w, brow6.w, brow7.w), _dot ); + + MM_DOT_PRODUCT( 0, dot00 ); + MM_DOT_PRODUCT( 1, dot01 ); + MM_DOT_PRODUCT( 2, dot02 ); + MM_DOT_PRODUCT( 3, dot03 ); + MM_DOT_PRODUCT( 4, dot04 ); + MM_DOT_PRODUCT( 5, dot05 ); + MM_DOT_PRODUCT( 6, dot06 ); + MM_DOT_PRODUCT( 7, dot07 ); +#undef MM_DOT_PRODUCT + } + +#define REDUCE(_dot) \ + _dot = as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 0)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 1)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 2)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 3)) + \ + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 4)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 5)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 6)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 7)); + + REDUCE(dot00); + REDUCE(dot01); + REDUCE(dot02); + REDUCE(dot03); + REDUCE(dot04); + REDUCE(dot05); + REDUCE(dot06); + REDUCE(dot07); +#undef REDUCE + + Dtype output = 0.0f; +#define OUTPUT( _dot) \ + output = (local_x == 0) ? _dot.s0 : output; \ + output = (local_x == 1) ? _dot.s1 : output; \ + output = (local_x == 2) ? _dot.s2 : output; \ + output = (local_x == 3) ? _dot.s3 : output; \ + output = (local_x == 4) ? _dot.s4 : output; \ + output = (local_x == 5) ? _dot.s5 : output; \ + output = (local_x == 6) ? _dot.s6 : output; \ + output = (local_x == 7) ? 
_dot.s7 : output; \ + dst_write0[0] = mad(output, alpha, beta * dst_write0[0]); \ + dst_write0 += N; + + if(global_x < N && global_y * 8 < M) { + OUTPUT(dot00); + if(mad24(global_y, 8, 1) < M) { OUTPUT(dot01); } + if(mad24(global_y, 8, 2) < M) { OUTPUT(dot02); } + if(mad24(global_y, 8, 3) < M) { OUTPUT(dot03); } + if(mad24(global_y, 8, 4) < M) { OUTPUT(dot04); } + if(mad24(global_y, 8, 5) < M) { OUTPUT(dot05); } + if(mad24(global_y, 8, 6) < M) { OUTPUT(dot06); } + if(mad24(global_y, 8, 7) < M) { OUTPUT(dot07); } + } +#undef OUTPUT +} +#endif + +#undef VEC_SIZE +#undef LWG_HEIGHT +#undef TILE_M +#undef TILE_K +#undef TILE_N +#undef SLM_BLOCK + +#define SLM_SIZE 64 +void TEMPLATE(gemm_buffer_NT_M_2_edgerows,Dtype)( + const __global Dtype* srca_read0, + const __global Dtype* srca_read1, + const __global Dtype* srcb_read, + __local Dtype4* work0, + __local Dtype4* work1, + int N, + int K, + int x_gid, + int lid, + Dtype alpha, + Dtype beta, + __global Dtype* dstc0, + __global Dtype* dstc1) +{ + __local Dtype* work_each0 = (__local Dtype*)work0; + __local Dtype* work_each1 = (__local Dtype*)work1; + + int rows = N - x_gid * 4; + + Dtype4 dot0[3] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)}; + Dtype4 dot1[3] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)}; + + int i = lid; + while( i < K / 4) { + const Dtype4 b0 = {srca_read0[i*4], srca_read0[(i*4+1)], srca_read0[(i*4+2)], srca_read0[(i*4+3)]}; + const Dtype4 b1 = {srca_read1[i*4], srca_read1[(i*4+1)], srca_read1[(i*4+2)], srca_read1[(i*4+3)]}; +#pragma unroll + for(int j = 0; j < rows; ++j) { + dot0[j] += b0 * vload4(i, srcb_read + j * K); + dot1[j] += b1 * vload4(i, srcb_read + j * K); + } + + i += get_local_size(0); + } +#pragma unroll + for(int j = 0; j < rows; ++j) { + work_each0[lid * 4 + j] = dot0[j].x + dot0[j].y + dot0[j].z + dot0[j].w; + work_each1[lid * 4 + j] = dot1[j].x + dot1[j].y + dot1[j].z + dot1[j].w; + } + + if(i == K / 4) { + short tail_items = K % 4; + + if(tail_items != 0) { + const __global Dtype 
*srcb_tail = srcb_read + i * 4; + const __global Dtype *srca_tail0 = srca_read0 + i * 4; + const __global Dtype *srca_tail1 = srca_read1 + i * 4; +#pragma unroll + for(short i = 0; i < tail_items; ++i) { + const Dtype at0 = srca_tail0[i]; + const Dtype at1 = srca_tail1[i]; +#pragma unroll + for(int j = 0; j < rows; ++j) { + work_each0[lid * 4 + j] += at0 * srcb_tail[i + j * K]; + work_each1[lid * 4 + j] += at1 * srcb_tail[i + j * K]; + } + } + } + } + + for(int stride = get_local_size(0) >> 1; stride > 0 ; stride >>= 1) { + barrier(CLK_LOCAL_MEM_FENCE); + if(lid < stride) { + work0[lid] += work0[lid+stride]; + work1[lid] += work1[lid+stride]; + } + } + + if(lid == 0) { +#pragma unroll + for(int j = 0; j < rows; ++j) { + dstc0[(x_gid * 4 + j)] = alpha * work_each0[j] + beta * dstc0[(x_gid * 4 + j)]; + dstc1[(x_gid * 4 + j)] = alpha * work_each1[j] + beta * dstc1[(x_gid * 4 + j)]; + } + } +} + +__kernel void TEMPLATE(gemm_buffer_NT_M_2,Dtype)( + __global const Dtype * A, + int offA, + __global const Dtype * B, + int offB, + __global Dtype * C, + int offC, + int M, + int N, + int K, + KERNEL_ARG_DTYPE alpha_f, + KERNEL_ARG_DTYPE beta_f) +{ + Dtype alpha = (Dtype)alpha_f; + Dtype beta = (Dtype)beta_f; + int x_gid = get_group_id(0); + int lid = get_local_id(0); + + const __global Dtype *srca_read0 = A + offA; + const __global Dtype *srca_read1 = srca_read0 + K; + + const __global Dtype *srcb_read = B + x_gid * 4 * K + offB; + + __global Dtype4 *dstc0 = (__global Dtype4*)(C + offC); + __global Dtype4 *dstc1 = (__global Dtype4*)((__global Dtype*)(dstc0) + N); + + __local Dtype4 work0[SLM_SIZE]; + __local Dtype4 work1[SLM_SIZE]; + __local Dtype* work_each0 = (__local Dtype*)work0; + __local Dtype* work_each1 = (__local Dtype*)work1; + + if(x_gid == N / 4) { + TEMPLATE(gemm_buffer_NT_M_2_edgerows,Dtype) \ + (srca_read0, srca_read1, srcb_read, work0, work1, N, K, x_gid, lid, alpha, beta, (__global Dtype*)dstc0, (__global Dtype*)dstc1); + } else { + Dtype4 dot0[4] = 
{(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)}; + Dtype4 dot1[4] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)}; + int i = lid; + while( i < K / 4) { + const Dtype4 b0 = vload4(i, srca_read0); + const Dtype4 b1 = vload4(i, srca_read1); +#pragma unroll + for(int j = 0; j < 4; ++j) { + Dtype4 a = vload4(i, srcb_read + j * K); + dot0[j] += b0 * a; + dot1[j] += b1 * a; + } + i += get_local_size(0); + } + +#pragma unroll + for(int j = 0; j < 4; ++j) { + work_each0[lid * 4 + j] = dot0[j].x + dot0[j].y + dot0[j].z + dot0[j].w; + work_each1[lid * 4 + j] = dot1[j].x + dot1[j].y + dot1[j].z + dot1[j].w; + } + + if(i == K / 4) { + short tail_items = K % 4; + if(tail_items != 0) { + const __global Dtype *srcb_tail = srcb_read + i * 4; + + const __global Dtype *srca_tail0 = srca_read0 + i * 4; + const __global Dtype *srca_tail1 = srca_read1 + i * 4; +#pragma unroll + for(short i = 0; i < tail_items; ++i) { + const Dtype at0 = srca_tail0[i]; + const Dtype at1 = srca_tail1[i]; +#pragma unroll + for(int j = 0; j < 4; ++j) { + work_each0[lid * 4 + j] += at0 * srcb_tail[i + j * K]; + work_each1[lid * 4 + j] += at1 * srcb_tail[i + j * K]; + } + } + } + } + + for(int stride = get_local_size(0) >> 1; stride > 0 ; stride >>= 1) { + barrier(CLK_LOCAL_MEM_FENCE); + if(lid < stride) { + work0[lid] += work0[lid+stride]; + work1[lid] += work1[lid+stride]; + } + } + + if(lid == 0) { + dstc0[x_gid] = alpha * work0[0] + beta * dstc0[x_gid]; + dstc1[x_gid] = alpha * work1[0] + beta * dstc1[x_gid]; + } + } +} +#undef SLM_SIZE + +#define SLM_SIZE 32 +void TEMPLATE(gemm_buffer_NT_M_4_edgerows,Dtype)( + const __global Dtype* srca_read0, + const __global Dtype* srca_read1, + const __global Dtype* srca_read2, + const __global Dtype* srca_read3, + const __global Dtype* srcb_read, + __local Dtype4* work0, + __local Dtype4* work1, + __local Dtype4* work2, + __local Dtype4* work3, + int N, + int K, + int x_gid, + int lid, + Dtype alpha, + Dtype beta, + __global Dtype* dstc0, + 
__global Dtype* dstc1, + __global Dtype* dstc2, + __global Dtype* dstc3) +{ + __local Dtype* work_each0 = (__local Dtype*)(work0 + lid); + __local Dtype* work_each1 = (__local Dtype*)(work1 + lid); + __local Dtype* work_each2 = (__local Dtype*)(work2 + lid); + __local Dtype* work_each3 = (__local Dtype*)(work3 + lid); + + int rows = N - x_gid * 4; + + Dtype4 dot0[3] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)}; + Dtype4 dot1[3] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)}; + Dtype4 dot2[3] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)}; + Dtype4 dot3[3] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)}; + + int i = lid; + while( i < K / 4) { + const Dtype4 a0 = {srca_read0[i*4], srca_read0[(i*4+1)], srca_read0[(i*4+2)], srca_read0[(i*4+3)]}; + const Dtype4 a1 = {srca_read1[i*4], srca_read1[(i*4+1)], srca_read1[(i*4+2)], srca_read1[(i*4+3)]}; + const Dtype4 a2 = {srca_read2[i*4], srca_read2[(i*4+1)], srca_read2[(i*4+2)], srca_read2[(i*4+3)]}; + const Dtype4 a3 = {srca_read3[i*4], srca_read3[(i*4+1)], srca_read3[(i*4+2)], srca_read3[(i*4+3)]}; +#pragma unroll + for(int j = 0; j < rows; ++j) { + dot0[j] += a0 * vload4(i, srcb_read + j * K); + dot1[j] += a1 * vload4(i, srcb_read + j * K); + dot2[j] += a2 * vload4(i, srcb_read + j * K); + dot3[j] += a3 * vload4(i, srcb_read + j * K); + } + + i += get_local_size(0); + } +#pragma unroll + for(int j = 0; j < rows; ++j) { + work_each0[j] = dot0[j].x + dot0[j].y + dot0[j].z + dot0[j].w; + work_each1[j] = dot1[j].x + dot1[j].y + dot1[j].z + dot1[j].w; + work_each2[j] = dot2[j].x + dot2[j].y + dot2[j].z + dot2[j].w; + work_each3[j] = dot3[j].x + dot3[j].y + dot3[j].z + dot3[j].w; + } + + if(i == K / 4) { + short tail_items = K % 4; + + if(tail_items != 0) { + const __global Dtype *srcb_tail = srcb_read + i * 4; + + const __global Dtype *srca_tail0 = srca_read0 + i * 4; + const __global Dtype *srca_tail1 = srca_read1 + i * 4; + const __global Dtype *srca_tail2 = srca_read2 + i * 4; + const __global Dtype *srca_tail3 = srca_read3 + i * 
4; +#pragma unroll + for(short i = 0; i < tail_items; ++i) { + const Dtype at0 = srca_tail0[i]; + const Dtype at1 = srca_tail1[i]; + const Dtype at2 = srca_tail2[i]; + const Dtype at3 = srca_tail3[i]; +#pragma unroll + for(int j = 0; j < rows; ++j) { + work_each0[j] += at0 * srcb_tail[i + j * K]; + work_each1[j] += at1 * srcb_tail[i + j * K]; + work_each2[j] += at2 * srcb_tail[i + j * K]; + work_each3[j] += at3 * srcb_tail[i + j * K]; + } + } + } + } + + for(int stride = get_local_size(0) >> 1; stride > 0 ; stride >>= 1) { + barrier(CLK_LOCAL_MEM_FENCE); + if(lid < stride) { + work0[lid] += work0[lid+stride]; + work1[lid] += work1[lid+stride]; + work2[lid] += work2[lid+stride]; + work3[lid] += work3[lid+stride]; + } + } + + if(lid == 0) { +#pragma unroll + for(int j = 0; j < rows; ++j) { + dstc0[(x_gid * 4 + j)] = alpha * work_each0[j] + beta * dstc0[(x_gid * 4 + j)]; + dstc1[(x_gid * 4 + j)] = alpha * work_each1[j] + beta * dstc1[(x_gid * 4 + j)]; + dstc2[(x_gid * 4 + j)] = alpha * work_each2[j] + beta * dstc2[(x_gid * 4 + j)]; + dstc3[(x_gid * 4 + j)] = alpha * work_each3[j] + beta * dstc3[(x_gid * 4 + j)]; + } + } +} + +__kernel void TEMPLATE(gemm_buffer_NT_M_4,Dtype)( + __global const Dtype * A, + int offA, + __global const Dtype * B, + int offB, + __global Dtype * C, + int offC, + int M, + int N, + int K, + KERNEL_ARG_DTYPE alpha_f, + KERNEL_ARG_DTYPE beta_f) +{ + Dtype alpha = (Dtype)alpha_f; + Dtype beta = (Dtype)beta_f; + int x_gid = get_group_id(0); + int lid = get_local_id(0); + int lsize = get_local_size(0); + + const __global Dtype *srca_read0 = A + offA; + const __global Dtype *srca_read1 = srca_read0 + K; + const __global Dtype *srca_read2 = srca_read1 + K; + const __global Dtype *srca_read3 = srca_read2 + K; + + const __global Dtype *srcb_read = B + x_gid * 4 * K + offB; + + __global Dtype4 *dstc0 = (__global Dtype4*)(C + offC); + __global Dtype4 *dstc1 = (__global Dtype4*)((__global Dtype*)(dstc0) + N); + __global Dtype4 *dstc2 = (__global 
Dtype4*)((__global Dtype*)(dstc1) + N); + __global Dtype4 *dstc3 = (__global Dtype4*)((__global Dtype*)(dstc2) + N); + + __local Dtype4 work0[SLM_SIZE]; + __local Dtype4 work1[SLM_SIZE]; + __local Dtype4 work2[SLM_SIZE]; + __local Dtype4 work3[SLM_SIZE]; + __local Dtype* work_each0 = (__local Dtype*)(work0 + lid); + __local Dtype* work_each1 = (__local Dtype*)(work1 + lid); + __local Dtype* work_each2 = (__local Dtype*)(work2 + lid); + __local Dtype* work_each3 = (__local Dtype*)(work3 + lid); + + if(x_gid == N / 4) { + TEMPLATE(gemm_buffer_NT_M_4_edgerows,Dtype) \ + (srca_read0, srca_read1, srca_read2, srca_read3, srcb_read, \ + work0, work1, work2, work3, N, K, x_gid, lid, alpha, beta, \ + (__global Dtype*)dstc0, (__global Dtype*)dstc1, (__global Dtype*)dstc2, (__global Dtype*)dstc3); + } else { + Dtype4 dot0[4] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)}; + Dtype4 dot1[4] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)}; + Dtype4 dot2[4] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)}; + Dtype4 dot3[4] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)}; + + int kid = lid; + while( kid < K / 4) { + const Dtype4 b0 = vload4(kid, srca_read0); + const Dtype4 b1 = vload4(kid, srca_read1); + const Dtype4 b2 = vload4(kid, srca_read2); + const Dtype4 b3 = vload4(kid, srca_read3); +#pragma unroll + for(int j = 0; j < 4; ++j) { + Dtype4 a = vload4(kid, srcb_read + j * K); + dot0[j] += b0 * a; + dot1[j] += b1 * a; + dot2[j] += b2 * a; + dot3[j] += b3 * a; + } + kid += lsize; + } +#pragma unroll + for(int j = 0; j < 4; ++j) { + work_each0[j] = dot0[j].x + dot0[j].y + dot0[j].z + dot0[j].w; + work_each1[j] = dot1[j].x + dot1[j].y + dot1[j].z + dot1[j].w; + work_each2[j] = dot2[j].x + dot2[j].y + dot2[j].z + dot2[j].w; + work_each3[j] = dot3[j].x + dot3[j].y + dot3[j].z + dot3[j].w; + } + + if(kid == (K >> 2)) { + short tail_items = K % 4; + if(tail_items != 0) { + int offset = kid << 2; + const __global Dtype *srcb_tail = srcb_read 
+ offset; + + const __global Dtype *srca_tail0 = srca_read0 + offset; + const __global Dtype *srca_tail1 = srca_read1 + offset; + const __global Dtype *srca_tail2 = srca_read2 + offset; + const __global Dtype *srca_tail3 = srca_read3 + offset; +#pragma unroll + for(short i = 0; i < tail_items; ++i) { + const Dtype at0 = srca_tail0[i]; + const Dtype at1 = srca_tail1[i]; + const Dtype at2 = srca_tail2[i]; + const Dtype at3 = srca_tail3[i]; +#pragma unroll + for(int j = 0; j < 4; ++j) { + work_each0[j] += at0 * srcb_tail[i + j * K]; + work_each1[j] += at1 * srcb_tail[i + j * K]; + work_each2[j] += at2 * srcb_tail[i + j * K]; + work_each3[j] += at3 * srcb_tail[i + j * K]; + } + } + } + } + + for(int stride = get_local_size(0) >> 1; stride > 0 ; stride >>= 1) { + barrier(CLK_LOCAL_MEM_FENCE); + if(lid < stride) { + work0[lid] += work0[lid+stride]; + work1[lid] += work1[lid+stride]; + work2[lid] += work2[lid+stride]; + work3[lid] += work3[lid+stride]; + } + } + + if(lid == 0) { + dstc0[x_gid] = alpha * work0[0] + beta * dstc0[x_gid]; + dstc1[x_gid] = alpha * work1[0] + beta * dstc1[x_gid]; + dstc2[x_gid] = alpha * work2[0] + beta * dstc2[x_gid]; + dstc3[x_gid] = alpha * work3[0] + beta * dstc3[x_gid]; + } + } +} +#undef SLM_SIZE + +#define SLM_SIZE 16 +__kernel void TEMPLATE(gemm_buffer_NT_M_8,Dtype)( + __global const Dtype * A, + int offA, + __global const Dtype * B, + int offB, + __global Dtype * C, + int offC, + int M, + int N, + int K, + KERNEL_ARG_DTYPE alpha_f, + KERNEL_ARG_DTYPE beta_f) +{ + Dtype alpha = (Dtype)alpha_f; + Dtype beta = (Dtype)beta_f; + int x_gid = get_group_id(0); + int lid = get_local_id(0); + int lsize = get_local_size(0); + + const __global Dtype *srca_read0 = A + offA; + const __global Dtype *srca_read1 = srca_read0 + K; + const __global Dtype *srca_read2 = srca_read1 + K; + const __global Dtype *srca_read3 = srca_read2 + K; + const __global Dtype *srca_read4 = srca_read3 + K; + const __global Dtype *srca_read5 = srca_read4 + K; + const 
__global Dtype *srca_read6 = srca_read5 + K; + const __global Dtype *srca_read7 = srca_read6 + K; + + const __global Dtype *srcb_read = B + x_gid * K + offB; + + __global Dtype *dstc0 = C + offC; + __global Dtype *dstc1 = dstc0 + N; + __global Dtype *dstc2 = dstc1 + N; + __global Dtype *dstc3 = dstc2 + N; + __global Dtype *dstc4 = dstc3 + N; + __global Dtype *dstc5 = dstc4 + N; + __global Dtype *dstc6 = dstc5 + N; + __global Dtype *dstc7 = dstc6 + N; + + __local Dtype work0[SLM_SIZE]; + __local Dtype work1[SLM_SIZE]; + __local Dtype work2[SLM_SIZE]; + __local Dtype work3[SLM_SIZE]; + __local Dtype work4[SLM_SIZE]; + __local Dtype work5[SLM_SIZE]; + __local Dtype work6[SLM_SIZE]; + __local Dtype work7[SLM_SIZE]; + + Dtype4 dot0 = (Dtype4)(0.); + Dtype4 dot1 = (Dtype4)(0.); + Dtype4 dot2 = (Dtype4)(0.); + Dtype4 dot3 = (Dtype4)(0.); + Dtype4 dot4 = (Dtype4)(0.); + Dtype4 dot5 = (Dtype4)(0.); + Dtype4 dot6 = (Dtype4)(0.); + Dtype4 dot7 = (Dtype4)(0.); + + int kid = lid; + while( kid < K / 4) { + const Dtype4 a0 = vload4(kid, srca_read0); + const Dtype4 a1 = vload4(kid, srca_read1); + const Dtype4 a2 = vload4(kid, srca_read2); + const Dtype4 a3 = vload4(kid, srca_read3); + const Dtype4 a4 = vload4(kid, srca_read4); + const Dtype4 a5 = vload4(kid, srca_read5); + const Dtype4 a6 = vload4(kid, srca_read6); + const Dtype4 a7 = vload4(kid, srca_read7); + Dtype4 b = vload4(kid, srcb_read); + dot0 += a0 * b; + dot1 += a1 * b; + dot2 += a2 * b; + dot3 += a3 * b; + dot4 += a4 * b; + dot5 += a5 * b; + dot6 += a6 * b; + dot7 += a7 * b; + + kid += lsize; + } + work0[lid] = dot0.x + dot0.y + dot0.z + dot0.w; + work1[lid] = dot1.x + dot1.y + dot1.z + dot1.w; + work2[lid] = dot2.x + dot2.y + dot2.z + dot2.w; + work3[lid] = dot3.x + dot3.y + dot3.z + dot3.w; + work4[lid] = dot4.x + dot4.y + dot4.z + dot4.w; + work5[lid] = dot5.x + dot5.y + dot5.z + dot5.w; + work6[lid] = dot6.x + dot6.y + dot6.z + dot6.w; + work7[lid] = dot7.x + dot7.y + dot7.z + dot7.w; + + if(kid == (K >> 2)) { + 
short tail_items = K % 4; + if(tail_items != 0) { + int offset = kid << 2; + const __global Dtype *srcb_tail = srcb_read + offset; + + const __global Dtype *srca_tail0 = srca_read0 + offset; + const __global Dtype *srca_tail1 = srca_read1 + offset; + const __global Dtype *srca_tail2 = srca_read2 + offset; + const __global Dtype *srca_tail3 = srca_read3 + offset; + const __global Dtype *srca_tail4 = srca_read4 + offset; + const __global Dtype *srca_tail5 = srca_read5 + offset; + const __global Dtype *srca_tail6 = srca_read6 + offset; + const __global Dtype *srca_tail7 = srca_read7 + offset; +#pragma unroll + for(short item = 0; item < tail_items; ++item) { + work0[lid] += srca_tail0[item] * srcb_tail[item]; + work1[lid] += srca_tail1[item] * srcb_tail[item]; + work2[lid] += srca_tail2[item] * srcb_tail[item]; + work3[lid] += srca_tail3[item] * srcb_tail[item]; + work4[lid] += srca_tail4[item] * srcb_tail[item]; + work5[lid] += srca_tail5[item] * srcb_tail[item]; + work6[lid] += srca_tail6[item] * srcb_tail[item]; + work7[lid] += srca_tail7[item] * srcb_tail[item]; + } + } + } + + for(int stride = get_local_size(0) >> 1; stride > 0 ; stride >>= 1) { + barrier(CLK_LOCAL_MEM_FENCE); + if(lid < stride) { + work0[lid] += work0[lid+stride]; + work1[lid] += work1[lid+stride]; + work2[lid] += work2[lid+stride]; + work3[lid] += work3[lid+stride]; + work4[lid] += work4[lid+stride]; + work5[lid] += work5[lid+stride]; + work6[lid] += work6[lid+stride]; + work7[lid] += work7[lid+stride]; + } + } + + if(lid == 0) { + dstc0[x_gid] = alpha * work0[0] + beta * dstc0[x_gid]; + dstc1[x_gid] = alpha * work1[0] + beta * dstc1[x_gid]; + dstc2[x_gid] = alpha * work2[0] + beta * dstc2[x_gid]; + dstc3[x_gid] = alpha * work3[0] + beta * dstc3[x_gid]; + dstc4[x_gid] = alpha * work4[0] + beta * dstc4[x_gid]; + dstc5[x_gid] = alpha * work5[0] + beta * dstc5[x_gid]; + dstc6[x_gid] = alpha * work6[0] + beta * dstc6[x_gid]; + dstc7[x_gid] = alpha * work7[0] + beta * dstc7[x_gid]; + } +} +#undef 
SLM_SIZE + +#undef VEC_SIZE +#undef LWG_HEIGHT +#undef TILE_M +#undef TILE_K +#undef TILE_N +#undef SIMD_SIZE_GEMM +#undef SHUFFLE_TYPE2 +#undef SHUFFLE_TYPE8 diff --git a/modules/dnn/src/opencl/gemm_image.cl b/modules/dnn/src/opencl/gemm_image.cl index 37ae523a21..710637a093 100644 --- a/modules/dnn/src/opencl/gemm_image.cl +++ b/modules/dnn/src/opencl/gemm_image.cl @@ -39,24 +39,42 @@ // //M*/ +#if defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif + #define CONCAT(A,B) A##_##B #define TEMPLATE(name,type) CONCAT(name,type) -// Types used for parameters, offset computations and so on -#define int_tp int -#define uint_tp unsigned int - +#define KERNEL_ARG_DTYPE float +#define TYPE_FLOAT 1 +#define TYPE_HALF 2 + +#if TYPE == TYPE_HALF +#define Dtype half +#define Dtype2 half2 +#define Dtype4 half4 +#define Dtype8 half8 +#define Dtype16 half16 + +#define as_Dtype as_half +#define as_Dtype2 as_half2 +#define as_Dtype4 as_half4 +#define as_Dtype8 as_half8 +#define as_Dtype16 as_half16 +#else #define Dtype float #define Dtype2 float2 #define Dtype4 float4 #define Dtype8 float8 +#define Dtype16 float16 #define as_Dtype as_float #define as_Dtype2 as_float2 #define as_Dtype4 as_float4 #define as_Dtype8 as_float8 - -#define KERNEL_ARG_DTYPE float +#define as_Dtype16 as_float16 +#endif #if defined(cl_intel_subgroups) #pragma OPENCL EXTENSION cl_intel_subgroups : enable @@ -67,6 +85,15 @@ // common block to calculate (alpha * AxB + beta * C) and output to destination image. 
+#if TYPE == TYPE_HALF +#define SUBGROUP_BLOCK_READ8( __image, __coord ) intel_sub_group_block_read_us8( __image, __coord ) +#define SHUFFLE_TYPE2(val) as_ushort2(val) +#define SHUFFLE_TYPE8(val) as_ushort8(val) +#define READ_IMAGE(__image, __coord) read_imageh(__image, sampler, __coord) +#define SIZE_OF_ELEMENT sizeof(ushort) +#define SIMD_SIZE_GEMM 16 +#define TILE_N 16 +#else #define SUBGROUP_BLOCK_READ8( __image, __coord ) intel_sub_group_block_read8( __image, __coord ) #define SHUFFLE_TYPE2(val) val #define SHUFFLE_TYPE8(val) val @@ -74,11 +101,17 @@ #define SIZE_OF_ELEMENT sizeof(uint) #define SIMD_SIZE_GEMM 8 #define TILE_N 8 +#endif //#define USE_IMAGE_C #ifdef USE_IMAGE_C +#if TYPE == TYPE_HALF +#define BLOCKC_READ8( _C, _coordC ) as_Dtype8( intel_sub_group_block_read_us8( _C, _coordC ) ) +#define BLOCKC_WRITE8( _C, _coordC, _val ) intel_sub_group_block_write_us8( _C, _coordC, as_ushort8( _val ) ) +#else #define BLOCKC_READ8( _C, _coordC ) as_Dtype8( intel_sub_group_block_read8( _C, _coordC ) ) #define BLOCKC_WRITE8( _C, _coordC, _val ) intel_sub_group_block_write8( _C, _coordC, as_uint8( _val ) ) +#endif #define MATC_PARAMETER __read_only image2d_t C, __write_only image2d_t dst #define GEMM_OUTPUT(ALPHA1, BETA_NOT0) GEMM_OUTPUT_EXT(ALPHA1, BETA_NOT0, C, dst, sizeof(uint)) #else @@ -139,10 +172,10 @@ blockC03 += blockAxB03; \ } \ } else { \ - blockC00 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \ - blockC01 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \ - blockC02 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \ - blockC03 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); \ + blockC00 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \ + blockC01 = isFirstColBlock ? (Dtype)0. 
: BLOCKC_READ8( _C, coordC ); coordC.y += 8; \ + blockC02 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \ + blockC03 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC ); \ if (!ALPHA1) { \ blockC00 = mad(blockAxB00, (Dtype8)alpha, blockC00); \ blockC01 = mad(blockAxB01, (Dtype8)alpha, blockC01); \ @@ -172,6 +205,43 @@ intel_sub_group_shuffle( _block.s7, _col ) ); // A's column block multiply B 's row block. +#if TYPE == TYPE_HALF +#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB00, _blockB01 ) \ + { \ + const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \ + const Dtype8 acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 ); \ + const Dtype8 acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 ); \ + const Dtype8 acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 ); \ + const Dtype8 acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 ); \ + const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 ); \ + const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 ); \ + const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 ); \ + const Dtype8 acol8 = TRANSPOSE_BLOCK_8( _blockA, 8 ); \ + const Dtype8 acol9 = TRANSPOSE_BLOCK_8( _blockA, 9 ); \ + const Dtype8 acola = TRANSPOSE_BLOCK_8( _blockA, 10 ); \ + const Dtype8 acolb = TRANSPOSE_BLOCK_8( _blockA, 11 ); \ + const Dtype8 acolc = TRANSPOSE_BLOCK_8( _blockA, 12 ); \ + const Dtype8 acold = TRANSPOSE_BLOCK_8( _blockA, 13 ); \ + const Dtype8 acole = TRANSPOSE_BLOCK_8( _blockA, 14 ); \ + const Dtype8 acolf = TRANSPOSE_BLOCK_8( _blockA, 15 ); \ + _result = mad( (Dtype8)(_blockB00.s0), acol0, _result ); \ + _result = mad( (Dtype8)(_blockB00.s1), acol1, _result ); \ + _result = mad( (Dtype8)(_blockB00.s2), acol2, _result ); \ + _result = mad( (Dtype8)(_blockB00.s3), acol3, _result ); \ + _result = mad( (Dtype8)(_blockB00.s4), acol4, _result ); \ + _result = mad( (Dtype8)(_blockB00.s5), acol5, _result ); \ + _result = mad( (Dtype8)(_blockB00.s6), acol6, _result ); \ + _result = mad( (Dtype8)(_blockB00.s7), acol7, _result ); \ + _result = mad( 
(Dtype8)(_blockB01.s0), acol8, _result ); \ + _result = mad( (Dtype8)(_blockB01.s1), acol9, _result ); \ + _result = mad( (Dtype8)(_blockB01.s2), acola, _result ); \ + _result = mad( (Dtype8)(_blockB01.s3), acolb, _result ); \ + _result = mad( (Dtype8)(_blockB01.s4), acolc, _result ); \ + _result = mad( (Dtype8)(_blockB01.s5), acold, _result ); \ + _result = mad( (Dtype8)(_blockB01.s6), acole, _result ); \ + _result = mad( (Dtype8)(_blockB01.s7), acolf, _result ); \ + } +#else #define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \ { \ const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \ @@ -191,7 +261,50 @@ _result = mad( (Dtype8)(_blockB.s6), acol6, _result ); \ _result = mad( (Dtype8)(_blockB.s7), acol7, _result ); \ } +#endif +#if TYPE == TYPE_HALF +#define GEMM_NN(ALPHA1, BETA_NOT0) \ +__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \ +__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \ +__kernel void TEMPLATE(gemm_32_1_NN_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \ + __read_only image2d_t A, \ + __read_only image2d_t B, \ + MATC_PARAMETER, \ + KERNEL_ARG_DTYPE alpha_in, \ + KERNEL_ARG_DTYPE beta_in, \ + int width0, \ + int isFirstColBlock) \ +{ \ + const Dtype alpha = (Dtype)alpha_in; \ + const Dtype beta = (Dtype)beta_in; \ + const int group_x = get_group_id(0); \ + const int group_y = get_group_id(1); \ + Dtype8 blockAxB00 = 0; \ + Dtype8 blockAxB01 = 0; \ + Dtype8 blockAxB02 = 0; \ + Dtype8 blockAxB03 = 0; \ + int2 coordA = (int2)( 0, group_y * TILE_M ); \ + int2 coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 ); \ + do \ + { \ + int2 coordBTemp = coordB; \ + Dtype8 blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K; \ + Dtype8 blockB01 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K; \ + int2 coordATemp = coordA; \ + Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \ + Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, 
coordATemp ) ); coordATemp.y += 8; \ + Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \ + Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.x += TILE_K * SIZE_OF_ELEMENT * 2; \ + MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, blockB01 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00, blockB01 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00, blockB01 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00, blockB01 ); \ + } \ + while( coordB.y < width0 ); \ + GEMM_OUTPUT(ALPHA1, BETA_NOT0); \ +} +#else #define GEMM_NN(ALPHA1, BETA_NOT0) \ __attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \ __attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \ @@ -231,6 +344,7 @@ __kernel void TEMPLATE(gemm_32_1_NN_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \ while( coordB.y < width0 ); \ GEMM_OUTPUT(ALPHA1, BETA_NOT0); \ } +#endif GEMM_NN(1, 0) // ALPHA == 1, BETA == 0 GEMM_NN(1, 1) // ALPHA == 1, BETA != 0 @@ -264,6 +378,45 @@ GEMM_NN(0, 1) // ALPHA != 1, BETA != 0 _result = mad( (Dtype8)(_blockB.s7), TRANSPOSE_BLOCK_8(_blockA.s7, _col), _result ); \ } +#if TYPE == TYPE_HALF +#define GEMM_TN(ALPHA1, BETA_NOT0) \ +__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \ +__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \ +__kernel void TEMPLATE(gemm_32_1_TN_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \ + __read_only image2d_t A, \ + __read_only image2d_t B, \ + MATC_PARAMETER, \ + KERNEL_ARG_DTYPE alpha_in, \ + KERNEL_ARG_DTYPE beta_in, \ + int width0, \ + int isFirstColBlock) \ +{ \ + const Dtype alpha = (Dtype)alpha_in; \ + const Dtype beta = (Dtype)beta_in; \ + const int group_x = get_group_id(0);\ + const int group_y = get_group_id(1);\ + Dtype8 blockAxB00 = 0;\ + Dtype8 blockAxB01 = 0;\ + Dtype8 blockAxB02 = 0;\ + Dtype8 blockAxB03 = 0;\ + int2 coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 );\ + int2 coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 
);\ + do\ + {\ + int2 coordBTemp = coordB;\ + Dtype8 blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K;\ + int2 coordATemp = coordA;\ + Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 16 * SIZE_OF_ELEMENT;\ + Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.y += TILE_K;\ + MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, 0); \ + MULTIPLY_BLOCKS_8x8( blockAxB01, blockA00, blockB00, 8); \ + MULTIPLY_BLOCKS_8x8( blockAxB02, blockA01, blockB00, 0); \ + MULTIPLY_BLOCKS_8x8( blockAxB03, blockA01, blockB00, 8); \ + } \ + while( coordB.y < width0 ); \ + GEMM_OUTPUT(ALPHA1, BETA_NOT0); \ +} +#else #define GEMM_TN(ALPHA1, BETA_NOT0) \ __attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \ __attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \ @@ -303,6 +456,7 @@ __kernel void TEMPLATE(gemm_32_1_TN_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \ while( coordB.y < width0 ); \ GEMM_OUTPUT(ALPHA1, BETA_NOT0); \ } +#endif GEMM_TN(1, 0) // ALPHA == 1, BETA == 0 GEMM_TN(1, 1) // ALPHA == 1, BETA != 0 @@ -324,6 +478,7 @@ GEMM_TN(0, 1) // ALPHA != 1, BETA != 0 intel_sub_group_shuffle( _block.s6, _col), \ intel_sub_group_shuffle( _block.s7, _col) ) +#if TYPE == TYPE_HALF #define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \ { \ const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \ @@ -334,6 +489,14 @@ GEMM_TN(0, 1) // ALPHA != 1, BETA != 0 const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 ); \ const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 ); \ const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 ); \ + const Dtype8 acol8 = TRANSPOSE_BLOCK_8( _blockA, 8 ); \ + const Dtype8 acol9 = TRANSPOSE_BLOCK_8( _blockA, 9 ); \ + const Dtype8 acola = TRANSPOSE_BLOCK_8( _blockA, 10 ); \ + const Dtype8 acolb = TRANSPOSE_BLOCK_8( _blockA, 11 ); \ + const Dtype8 acolc = TRANSPOSE_BLOCK_8( _blockA, 12 ); \ + const Dtype8 acold = TRANSPOSE_BLOCK_8( _blockA, 13 ); \ + const Dtype8 acole = 
TRANSPOSE_BLOCK_8( _blockA, 14 ); \ + const Dtype8 acolf = TRANSPOSE_BLOCK_8( _blockA, 15 ); \ _result = mad( (Dtype8)_blockB.s0, acol0, _result ); \ _result = mad( (Dtype8)_blockB.s1, acol1, _result ); \ _result = mad( (Dtype8)_blockB.s2, acol2, _result ); \ @@ -342,8 +505,80 @@ GEMM_TN(0, 1) // ALPHA != 1, BETA != 0 _result = mad( (Dtype8)_blockB.s5, acol5, _result ); \ _result = mad( (Dtype8)_blockB.s6, acol6, _result ); \ _result = mad( (Dtype8)_blockB.s7, acol7, _result ); \ + _result = mad( (Dtype8)_blockB.s8, acol8, _result ); \ + _result = mad( (Dtype8)_blockB.s9, acol9, _result ); \ + _result = mad( (Dtype8)_blockB.sa, acola, _result ); \ + _result = mad( (Dtype8)_blockB.sb, acolb, _result ); \ + _result = mad( (Dtype8)_blockB.sc, acolc, _result ); \ + _result = mad( (Dtype8)_blockB.sd, acold, _result ); \ + _result = mad( (Dtype8)_blockB.se, acole, _result ); \ + _result = mad( (Dtype8)_blockB.sf, acolf, _result ); \ } +#else +#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \ + { \ + const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \ + const Dtype8 acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 ); \ + const Dtype8 acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 ); \ + const Dtype8 acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 ); \ + const Dtype8 acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 ); \ + const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 ); \ + const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 ); \ + const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 ); \ + _result = mad( (Dtype8)_blockB.s0, acol0, _result ); \ + _result = mad( (Dtype8)_blockB.s1, acol1, _result ); \ + _result = mad( (Dtype8)_blockB.s2, acol2, _result ); \ + _result = mad( (Dtype8)_blockB.s3, acol3, _result ); \ + _result = mad( (Dtype8)_blockB.s4, acol4, _result ); \ + _result = mad( (Dtype8)_blockB.s5, acol5, _result ); \ + _result = mad( (Dtype8)_blockB.s6, acol6, _result ); \ + _result = mad( (Dtype8)_blockB.s7, acol7, _result ); \ + } +#endif +#if TYPE == TYPE_HALF +#define GEMM_NT(ALPHA1, 
BETA_NOT0, VECSCALAR, VECSIZE) \ +__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \ +__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \ +__kernel void TEMPLATE(gemm_32_1_NT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \ + __read_only image2d_t A, \ + MATB_PARAMETER, \ + MATC_PARAMETER, \ + KERNEL_ARG_DTYPE alpha_in, \ + KERNEL_ARG_DTYPE beta_in, \ + int padded_k, \ + int k, \ + int isFirstColBlock) \ +{ \ + const Dtype alpha = (Dtype)alpha_in; \ + const Dtype beta = (Dtype)beta_in; \ + const int group_x = get_group_id(0); \ + const int group_y = get_group_id(1); \ + Dtype8 blockAxB00 = 0; \ + Dtype8 blockAxB01 = 0; \ + Dtype8 blockAxB02 = 0; \ + Dtype8 blockAxB03 = 0; \ + int2 coordA = (int2)( 0, group_y * TILE_M ); \ + int2 coordB = (int2)( 0, ( group_x * TILE_N )); \ + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \ + do \ + { \ + Dtype16 blockB00; \ + BLOCKB_READ8(blockB00, B, coordB); \ + int2 coordATemp = coordA; \ + Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \ + Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \ + Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \ + Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.x += TILE_K * SIZE_OF_ELEMENT * 2; \ + MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00 ); \ + } \ + while( coordB.x < padded_k / VECSIZE ); \ + GEMM_OUTPUT(ALPHA1, BETA_NOT0); \ +} +#else #define GEMM_NT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \ __attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \ __attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \ @@ -385,12 +620,23 @@ __kernel void TEMPLATE(gemm_32_1_NT_ ##VECSCALAR ##_ 
##ALPHA1 ##_ ##BETA_NOT0,Dt while( coordB.x < padded_k / VECSIZE ); \ GEMM_OUTPUT(ALPHA1, BETA_NOT0); \ } +#endif +#if TYPE == TYPE_HALF +#define BLOCKB_READ8(_blockb, _B, _coordB) \ + int2 _coordBTemp = _coordB; \ + _coordBTemp.y += get_local_id(0); \ + _blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s89ab = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.scdef = READ_IMAGE(_B, _coordBTemp); _coordB.x += 4; +#else #define BLOCKB_READ8(_blockb, _B, _coordB) \ int2 _coordBTemp = _coordB; \ _coordBTemp.y += get_local_id(0); \ _blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ _blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordB.x += 2; +#endif #define MATB_PARAMETER __read_only image2d_t B @@ -401,12 +647,21 @@ GEMM_NT(0, 1, VEC4, 4) // ALPHA != 1, BETA != 0 #undef BLOCKB_READ8 #undef MATB_PARAMETER +#if TYPE == TYPE_HALF +#define BLOCKB_READ8(_blockb, _B, _coordB) \ + int2 _coordBTemp = _coordB; \ + _coordBTemp.y += get_local_id(0); \ + const __global float *B_read = (__global float *)(_B + (_coordBTemp.y * ldb) + _coordBTemp.x + offB); \ + _blockb = as_Dtype16(as_ushort16(vload8(0, B_read))); \ + _coordB.x += TILE_K * 2; +#else #define BLOCKB_READ8(_blockb, _B, _coordB) \ int2 _coordBTemp = _coordB; \ _coordBTemp.y += get_local_id(0); \ const __global Dtype *B_read = (__global Dtype *)(_B + (_coordBTemp.y * ldb) + _coordBTemp.x + offB); \ _blockb = vload8(0, B_read); \ _coordB.x += TILE_K; +#endif #define MATB_PARAMETER __global Dtype *B, int offB, int ldb @@ -417,6 +672,45 @@ GEMM_NT(0, 1, BUFFER, 1) // ALPHA != 1, BETA != 0 #undef BLOCKB_READ8 #undef MATB_PARAMETER +#if TYPE == TYPE_HALF +#define BLOCKB_READ8(_blockb, _B, _coordB) \ + int2 _coordBTemp = _coordB; \ + _coordBTemp.y += get_local_id(0); \ + Dtype4 temp; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s0 = temp.s0; \ + temp = READ_IMAGE(_B, 
_coordBTemp); _coordBTemp.x += 1; \ + _blockb.s1 = temp.s0; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s2 = temp.s0; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s3 = temp.s0; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s4 = temp.s0; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s5 = temp.s0; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s6 = temp.s0; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s7 = temp.s0; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s8 = temp.s0; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s9 = temp.s0; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.sa = temp.s0; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.sb = temp.s0; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.sc = temp.s0; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.sd = temp.s0; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.se = temp.s0; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.sf = temp.s0; \ + _coordB.x += 16; +#else #define BLOCKB_READ8(_blockb, _B, _coordB) \ int2 _coordBTemp = _coordB; \ _coordBTemp.y += get_local_id(0); \ @@ -438,6 +732,7 @@ GEMM_NT(0, 1, BUFFER, 1) // ALPHA != 1, BETA != 0 temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ _blockb.s7 = temp.s0; \ _coordB.x += 8; +#endif #define MATB_PARAMETER __read_only image2d_t B @@ -483,6 +778,47 @@ GEMM_NT(0, 1, SCALAR, 1) // ALPHA != 1, BETA != 0 _result = mad( (Dtype8)_blockB.s7, acol7, _result ); \ } +#if TYPE == TYPE_HALF +#define GEMM_TT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \ +__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \ +__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \ +__kernel void TEMPLATE(gemm_32_1_TT_ 
##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \ + __read_only image2d_t A, \ + MATB_PARAMETER, \ + MATC_PARAMETER, \ + KERNEL_ARG_DTYPE alpha_in, \ + KERNEL_ARG_DTYPE beta_in, \ + int padded_k, \ + int k, \ + int isFirstColBlock) \ +{ \ + const Dtype alpha = (Dtype)alpha_in; \ + const Dtype beta = (Dtype)beta_in; \ + const int group_x = get_group_id(0); \ + const int group_y = get_group_id(1); \ + Dtype8 blockAxB00 = 0; \ + Dtype8 blockAxB01 = 0; \ + Dtype8 blockAxB02 = 0; \ + Dtype8 blockAxB03 = 0; \ + int2 coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 ); \ + int2 coordB = (int2)( 0, ( group_x * TILE_N )); \ + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \ + do \ + { \ + Dtype8 blockB00; \ + BLOCKB_READ8(blockB00, B, coordB); \ + int2 coordATemp = coordA; \ + Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 16 * SIZE_OF_ELEMENT;\ + Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.y += TILE_K;\ + MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, 0); \ + MULTIPLY_BLOCKS_8x8( blockAxB01, blockA00, blockB00, 8); \ + MULTIPLY_BLOCKS_8x8( blockAxB02, blockA01, blockB00, 0); \ + MULTIPLY_BLOCKS_8x8( blockAxB03, blockA01, blockB00, 8); \ + } \ + while( coordB.x < padded_k / VECSIZE ); \ + GEMM_OUTPUT(ALPHA1, BETA_NOT0);\ +} +#else #define GEMM_TT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \ __attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \ __attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \ @@ -524,6 +860,7 @@ __kernel void TEMPLATE(gemm_32_1_TT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0, D while( coordB.x < padded_k / VECSIZE ); \ GEMM_OUTPUT(ALPHA1, BETA_NOT0);\ } +#endif #define BLOCKB_READ8(_blockb, _B, _coordB) \ int2 _coordBTemp = _coordB; \ @@ -540,12 +877,21 @@ GEMM_TT(0, 1, VEC4, 4) // ALPHA != 1, BETA != 0 #undef BLOCKB_READ8 #undef MATB_PARAMETER +#if TYPE == TYPE_HALF +#define BLOCKB_READ8(_blockb, _B, _coordB) \ + 
int2 _coordBTemp = _coordB; \ + _coordBTemp.y += get_local_id(0); \ + const __global float *B_read = (__global float *)(_B + (_coordBTemp.y * k) + _coordBTemp.x + offB); \ + _blockb = as_Dtype8(as_ushort8(vload4(0, B_read))); \ + _coordB.x += TILE_K; +#else #define BLOCKB_READ8(_blockb, _B, _coordB) \ int2 _coordBTemp = _coordB; \ _coordBTemp.y += get_local_id(0); \ const __global Dtype *B_read = (__global Dtype *)(_B + (_coordBTemp.y * k) + _coordBTemp.x + offB); \ _blockb = vload8(0, B_read); \ _coordB.x += TILE_K; +#endif #define MATB_PARAMETER __global Dtype *B, int offB, int ldb @@ -598,7 +944,7 @@ GEMM_TT(0, 1, SCALAR, 1) // ALPHA != 1, BETA != 0 #undef READ_IMAGE #undef SIZE_OF_ELEMENT -__kernel void TEMPLATE(gemm_buffer_copy_image_transpose,Dtype)( +__kernel void TEMPLATE(gemm_buffer_copy_image_transpose, Dtype)( __global Dtype* A, __write_only image2d_t ImA, int offA, @@ -611,10 +957,14 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_transpose,Dtype)( int2 coord_dst = (int2)(gidx, gidy); __global Dtype* A_off = A + offA; Dtype srcA = A_off[gidy * ldA + gidx]; +#if TYPE == TYPE_HALF + write_imageh(ImA, coord_dst, (Dtype4)srcA); +#else write_imagef(ImA, coord_dst, (Dtype4)srcA); +#endif } -__kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose,Dtype)( +__kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose, Dtype)( __global Dtype* A, __write_only image2d_t ImA, int offA, @@ -625,6 +975,14 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose,Dtype)( const int gidx = get_global_id(0); const int gidy = get_global_id(1); int2 coord_dst = (int2)(gidx, gidy); +#if TYPE == TYPE_HALF + if (gidx >= width || gidy >= height) { + write_imageh(ImA, coord_dst, 0); + return; + } + __global Dtype* A_off = A + offA; + write_imageh(ImA, coord_dst, A_off[gidy * ldA + gidx]); +#else if (gidx >= width || gidy >= height) { write_imageui(ImA, coord_dst, (uint4)0); return; @@ -632,4 +990,5 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose,Dtype)( 
__global Dtype* A_off = A + offA; uint4 srcA = convert_uint4(as_uchar4(A_off[gidy * ldA + gidx])); write_imageui(ImA, coord_dst, srcA); +#endif } diff --git a/modules/dnn/src/opencl/math.cl b/modules/dnn/src/opencl/math.cl index b8f4eff010..2be4f9f485 100644 --- a/modules/dnn/src/opencl/math.cl +++ b/modules/dnn/src/opencl/math.cl @@ -40,16 +40,20 @@ // //M*/ +#if defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif + #define CONCAT(A,B) A##_##B #define TEMPLATE(name,type) CONCAT(name,type) -#define Dtype float +#define KERNEL_ARG_DTYPE float -__kernel void TEMPLATE(axpy,Dtype)(const int n, const Dtype alpha, __global const Dtype* x, +__kernel void TEMPLATE(axpy,Dtype)(const int n, const KERNEL_ARG_DTYPE alpha, __global const Dtype* x, const int offx, __global Dtype* y, const int offy) { for (int index = get_global_id(0); index < n; index += get_global_size(0)) { Dtype src = x[offx + index]; Dtype dst = y[offy + index]; - y[offy + index] = alpha * src + dst; + y[offy + index] = convert_Dtype(alpha) * src + dst; } } diff --git a/modules/dnn/src/opencl/matvec_mul.cl b/modules/dnn/src/opencl/matvec_mul.cl index 0dabd62c54..849c4903d4 100644 --- a/modules/dnn/src/opencl/matvec_mul.cl +++ b/modules/dnn/src/opencl/matvec_mul.cl @@ -39,41 +39,45 @@ // //M*/ +#if defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif + #define CONCAT(A,B) A##_##B #define TEMPLATE(name,type) CONCAT(name,type) -#define Dtype float +#define KERNEL_ARG_DTYPE float __kernel void TEMPLATE(matvec_mul4,Dtype)( - __global const float * A, + __global const Dtype * A, int offA, unsigned int A_col_size, unsigned int trail_item, - __global const float * v, + __global const Dtype * v, int offv, - float alpha, - float beta, - __global float4 * result, + KERNEL_ARG_DTYPE alpha, + KERNEL_ARG_DTYPE beta, + __global Dtype4* result, int offr, - __local float4 * work) + __local Dtype4* work) { unsigned int row_gid = get_group_id(0); unsigned int lid = 
get_local_id(0); - const __global float *src0_read = A + row_gid * 4 * A_col_size + offA; - const __global float *src1_read = v + offv; - result = (__global float4*)((__global float*)result + offr); - float4 dot0 = (float4)(0.f); - float4 dot1 = (float4)(0.f); - float4 dot2 = (float4)(0.f); - float4 dot3 = (float4)(0.f); + const __global Dtype *src0_read = A + row_gid * 4 * A_col_size + offA; + const __global Dtype *src1_read = v + offv; + result = (__global Dtype4*)((__global Dtype*)result + offr); + Dtype4 dot0 = (Dtype4)(0.f); + Dtype4 dot1 = (Dtype4)(0.f); + Dtype4 dot2 = (Dtype4)(0.f); + Dtype4 dot3 = (Dtype4)(0.f); unsigned int i = lid; while( i < A_col_size / 4) { - const float4 a0 = vload4(i, src0_read); - const float4 a1 = vload4(i, src0_read + A_col_size); - const float4 a2 = vload4(i, src0_read + 2 * A_col_size); - const float4 a3 = vload4(i, src0_read + 3 * A_col_size); + const Dtype4 a0 = vload4(i, src0_read); + const Dtype4 a1 = vload4(i, src0_read + A_col_size); + const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size); + const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size); - const float4 b0 = vload4(i, src1_read); + const Dtype4 b0 = vload4(i, src1_read); dot0 += a0 * b0; dot1 += a1 * b0; @@ -92,15 +96,15 @@ __kernel void TEMPLATE(matvec_mul4,Dtype)( { if(trail_item != 0) { - const __global float *src0_trail = src0_read + i * 4; - const __global float *src1_trail = src1_read + i * 4; + const __global Dtype *src0_trail = src0_read + i * 4; + const __global Dtype *src1_trail = src1_read + i * 4; for(unsigned int i = 0; i < trail_item; ++i) { - const float at0 = src0_trail[i]; - const float at1 = src0_trail[i + A_col_size]; - const float at2 = src0_trail[i + 2 * A_col_size]; - const float at3 = src0_trail[i + 3 * A_col_size]; + const Dtype at0 = src0_trail[i]; + const Dtype at1 = src0_trail[i + A_col_size]; + const Dtype at2 = src0_trail[i + 2 * A_col_size]; + const Dtype at3 = src0_trail[i + 3 * A_col_size]; - const float bt = src1_trail[i]; + 
const Dtype bt = src1_trail[i]; work[lid].s0 += at0 * bt; work[lid].s1 += at1 * bt; @@ -118,40 +122,40 @@ __kernel void TEMPLATE(matvec_mul4,Dtype)( } if(lid == 0) { if(beta == (Dtype)0) - result[row_gid] = alpha * work[0]; + result[row_gid] = convert_Dtype(alpha) * work[0]; else - result[row_gid] = alpha * work[0] + beta * result[row_gid]; + result[row_gid] = convert_Dtype(alpha) * work[0] + convert_Dtype(beta) * result[row_gid]; } } /* This kernel used for the trailing rows when row_of_A %4 !=0 */ __kernel void TEMPLATE(matvec_mul1,Dtype)( - __global const float * A, + __global const Dtype * A, int offA, unsigned int A_col_size, unsigned int row_offset, unsigned int trail_item, - __global const float * v, + __global const Dtype * v, int offv, - float alpha, - float beta, - __global float * result, + KERNEL_ARG_DTYPE alpha, + KERNEL_ARG_DTYPE beta, + __global Dtype * result, int offr, - __local float * work) + __local Dtype * work) { unsigned int row_gid = get_group_id(0); unsigned int lid = get_local_id(0); - const __global float *src0_read = A + (row_offset + row_gid) * A_col_size + offA; - const __global float *src1_read = v + + offv; + const __global Dtype *src0_read = A + (row_offset + row_gid) * A_col_size + offA; + const __global Dtype *src1_read = v + + offv; result = result + offr; - float4 dot0 = (float4)(0.f); + Dtype4 dot0 = (Dtype4)(0.f); unsigned int i = lid; while( i < A_col_size / 4) { - const float4 a0 = vload4(i, src0_read); - const float4 b0 = vload4(i, src1_read); + const Dtype4 a0 = vload4(i, src0_read); + const Dtype4 b0 = vload4(i, src1_read); dot0 += a0 * b0; i += get_local_size(0); @@ -163,11 +167,11 @@ __kernel void TEMPLATE(matvec_mul1,Dtype)( { if(trail_item != 0) { - const __global float *src0_trail = src0_read + i * 4; - const __global float *src1_trail = src1_read + i * 4; + const __global Dtype *src0_trail = src0_read + i * 4; + const __global Dtype *src1_trail = src1_read + i * 4; for(unsigned int i = 0; i < trail_item; ++i) { - 
const float at0 = src0_trail[i]; - const float bt = src1_trail[i]; + const Dtype at0 = src0_trail[i]; + const Dtype bt = src1_trail[i]; work[lid] += at0 * bt; } @@ -182,10 +186,10 @@ __kernel void TEMPLATE(matvec_mul1,Dtype)( if(lid == 0) { if(beta == (Dtype)0) { - result[row_gid+row_offset] = alpha * work[0]; + result[row_gid+row_offset] = convert_Dtype(alpha) * work[0]; } else { - result[row_gid+row_offset] *= beta; - result[row_gid+row_offset] += alpha * work[0]; + result[row_gid+row_offset] *= convert_Dtype(beta); + result[row_gid+row_offset] += convert_Dtype(alpha) * work[0]; } } } diff --git a/modules/dnn/src/opencl/mvn.cl b/modules/dnn/src/opencl/mvn.cl index 9f8ab574ca..49a8ebbe64 100644 --- a/modules/dnn/src/opencl/mvn.cl +++ b/modules/dnn/src/opencl/mvn.cl @@ -40,7 +40,11 @@ // //M*/ -#define Dtype float +#if defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif + +#define Dtype float #define Dtype4 float4 #define Dtype8 float8 @@ -135,17 +139,17 @@ __kernel void MVN(__global const Dtype* src, store(dst_vec, dst, index); } -__kernel void MEAN_FUSE(__global const Dtype * A, +__kernel void MEAN_FUSE(__global const T * A, unsigned int A_col_size, float alpha, - __global Dtype4 * result, - __global Dtype * B, + __global T4 * mean, + __global Dtype * tmp, __local Dtype4 * work) { unsigned int row_gid = get_group_id(0); unsigned int lid = get_local_id(0); - const __global Dtype *src0_read = A + row_gid * 4 * A_col_size; - __global Dtype *dst0_read = B + row_gid * 4 * A_col_size; + const __global T *src0_read = A + row_gid * 4 * A_col_size; + __global Dtype *dst0_read = tmp + row_gid * 4 * A_col_size; Dtype4 dot0, dot1, dot2, dot3; dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f); @@ -153,15 +157,15 @@ __kernel void MEAN_FUSE(__global const Dtype * A, const Dtype4 b0 = (Dtype4)1.f; while( i < A_col_size / 4) { - const Dtype4 a0 = vload4(i, src0_read); - const Dtype4 a1 = vload4(i, src0_read + A_col_size); - const Dtype4 a2 = vload4(i, src0_read + 
2 * A_col_size); - const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size); + const T4 a0 = vload4(i, src0_read); + const T4 a1 = vload4(i, src0_read + A_col_size); + const T4 a2 = vload4(i, src0_read + 2 * A_col_size); + const T4 a3 = vload4(i, src0_read + 3 * A_col_size); - dot0 += a0; - dot1 += a1; - dot2 += a2; - dot3 += a3; + dot0 += convert_float4(a0); + dot1 += convert_float4(a1); + dot2 += convert_float4(a2); + dot3 += convert_float4(a3); i += get_local_size(0); } @@ -181,22 +185,22 @@ __kernel void MEAN_FUSE(__global const Dtype * A, if(lid == 0) { - result[row_gid] = alpha * work[0]; + mean[row_gid] = convert_T(alpha * work[0]); } Dtype4 sum = work[0] * alpha; i = lid; while( i < A_col_size / 4) { - const Dtype4 a0 = vload4(i, src0_read); - const Dtype4 a1 = vload4(i, src0_read + A_col_size); - const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size); - const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size); + const T4 a0 = vload4(i, src0_read); + const T4 a1 = vload4(i, src0_read + A_col_size); + const T4 a2 = vload4(i, src0_read + 2 * A_col_size); + const T4 a3 = vload4(i, src0_read + 3 * A_col_size); - dot0 = native_powr(a0 - (Dtype4)sum.x, 2); - dot1 = native_powr(a1 - (Dtype4)sum.y, 2); - dot2 = native_powr(a2 - (Dtype4)sum.z, 2); - dot3 = native_powr(a3 - (Dtype4)sum.w, 2); + dot0 = native_powr(convert_float4(a0) - (Dtype4)sum.x, 2); + dot1 = native_powr(convert_float4(a1) - (Dtype4)sum.y, 2); + dot2 = native_powr(convert_float4(a2) - (Dtype4)sum.z, 2); + dot3 = native_powr(convert_float4(a3) - (Dtype4)sum.w, 2); vstore4(dot0, i, dst0_read); vstore4(dot1, i, dst0_read + A_col_size); @@ -208,22 +212,22 @@ __kernel void MEAN_FUSE(__global const Dtype * A, } __kernel void MVN_FUSE(__global const Dtype * tmp, - __global const Dtype * A, - __global const Dtype4 * mean, + __global const T * A, + __global const T4 * mean, unsigned int A_col_size, const float alpha_val, const float eps, const float relu_slope, __global const Dtype4 * bnorm_weight, __global 
const Dtype4 * bnorm_bias, - __global Dtype * B, + __global T * B, __local Dtype4 * work) { unsigned int row_gid = get_group_id(0); unsigned int lid = get_local_id(0); const __global Dtype *src0_read = tmp + row_gid * 4 * A_col_size; - const __global Dtype *src1_read = A + row_gid * 4 * A_col_size; - __global Dtype *dst0_read = B + row_gid * 4 * A_col_size; + const __global T *src1_read = A + row_gid * 4 * A_col_size; + __global T *dst0_read = B + row_gid * 4 * A_col_size; Dtype4 dot0, dot1, dot2, dot3; dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f); @@ -257,7 +261,7 @@ __kernel void MVN_FUSE(__global const Dtype * tmp, } barrier(CLK_LOCAL_MEM_FENCE); - Dtype4 mean_val = mean[row_gid]; + Dtype4 mean_val = convert_float4(mean[row_gid]); Dtype4 dev_val = sqrt(work[0] * alpha_val) + (Dtype4)eps; Dtype4 alpha = (Dtype4)1.f / dev_val; @@ -271,15 +275,15 @@ __kernel void MVN_FUSE(__global const Dtype * tmp, i = lid; while( i < A_col_size / 4) { - const Dtype4 a0 = vload4(i, src1_read); - const Dtype4 a1 = vload4(i, src1_read + A_col_size); - const Dtype4 a2 = vload4(i, src1_read + 2 * A_col_size); - const Dtype4 a3 = vload4(i, src1_read + 3 * A_col_size); + const T4 a0 = vload4(i, src1_read); + const T4 a1 = vload4(i, src1_read + A_col_size); + const T4 a2 = vload4(i, src1_read + 2 * A_col_size); + const T4 a3 = vload4(i, src1_read + 3 * A_col_size); - dot0 = (a0 - (Dtype4)mean_val.x) * alpha.x; - dot1 = (a1 - (Dtype4)mean_val.y) * alpha.y; - dot2 = (a2 - (Dtype4)mean_val.z) * alpha.z; - dot3 = (a3 - (Dtype4)mean_val.w) * alpha.w; + dot0 = (convert_float4(a0) - (Dtype4)mean_val.x) * alpha.x; + dot1 = (convert_float4(a1) - (Dtype4)mean_val.y) * alpha.y; + dot2 = (convert_float4(a2) - (Dtype4)mean_val.z) * alpha.z; + dot3 = (convert_float4(a3) - (Dtype4)mean_val.w) * alpha.w; dot0 = dot0 * w.x + (Dtype4)b.x; dot1 = dot1 * w.y + (Dtype4)b.y; @@ -300,10 +304,10 @@ __kernel void MVN_FUSE(__global const Dtype * tmp, dot3 = select(new3, dot3, dot3 > (Dtype4)0.f); #endif - 
vstore4(dot0, i, dst0_read); - vstore4(dot1, i, dst0_read + A_col_size); - vstore4(dot2, i, dst0_read + 2 * A_col_size); - vstore4(dot3, i, dst0_read + 3 * A_col_size); + vstore4(convert_T(dot0), i, dst0_read); + vstore4(convert_T(dot1), i, dst0_read + A_col_size); + vstore4(convert_T(dot2), i, dst0_read + 2 * A_col_size); + vstore4(convert_T(dot3), i, dst0_read + 3 * A_col_size); i += get_local_size(0); } diff --git a/modules/dnn/src/opencl/ocl4dnn_lrn.cl b/modules/dnn/src/opencl/ocl4dnn_lrn.cl index 58477cef0c..36d9d2ae04 100644 --- a/modules/dnn/src/opencl/ocl4dnn_lrn.cl +++ b/modules/dnn/src/opencl/ocl4dnn_lrn.cl @@ -42,14 +42,18 @@ #define CONCAT(A,B) A##_##B #define TEMPLATE(name,type) CONCAT(name,type) -#define Dtype float +#define KERNEL_ARG_DTYPE float + +#if defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global const Dtype* in, const int num, const int channels, const int height, const int width, const int size, - const Dtype alpha_over_size, const Dtype k, + const KERNEL_ARG_DTYPE alpha_over_size, const KERNEL_ARG_DTYPE k, __global Dtype* const out, - const Dtype negative_beta) { + const KERNEL_ARG_DTYPE negative_beta) { for (int index = get_global_id(0); index < nthreads; index += get_global_size(0)) { // find out the local offset @@ -60,11 +64,11 @@ __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global con const int step = height * width; __global const Dtype* in_off = in + offset; __global Dtype* out_off = out + offset; - Dtype scale_val; + KERNEL_ARG_DTYPE scale_val; int head = 0; const int pre_pad = (size - 1) / 2; const int post_pad = size - pre_pad - 1; - Dtype accum_scale = 0; + KERNEL_ARG_DTYPE accum_scale = 0; // fill the scale at [n, :, h, w] // accumulate values while (head < post_pad && head < channels) { @@ -79,7 +83,7 @@ __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global con * in_off[(head - 
size) * step]; } scale_val = k + accum_scale * alpha_over_size; - out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((float)scale_val, (float)negative_beta); + out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((Dtype)scale_val, (Dtype)negative_beta); ++head; } // subtract only @@ -89,7 +93,7 @@ __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global con * in_off[(head - size) * step]; } scale_val = k + accum_scale * alpha_over_size; - out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((float)scale_val, (float)negative_beta); + out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((Dtype)scale_val, (Dtype)negative_beta); ++head; } } diff --git a/modules/dnn/src/opencl/ocl4dnn_pooling.cl b/modules/dnn/src/opencl/ocl4dnn_pooling.cl index 13e4319172..e9d1d26f0f 100644 --- a/modules/dnn/src/opencl/ocl4dnn_pooling.cl +++ b/modules/dnn/src/opencl/ocl4dnn_pooling.cl @@ -42,7 +42,10 @@ #define CONCAT(A,B) A##_##B #define TEMPLATE(name,type) CONCAT(name,type) -#define Dtype float + +#if defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif #if defined KERNEL_MAX_POOL diff --git a/modules/dnn/src/opencl/permute.cl b/modules/dnn/src/opencl/permute.cl index 38aa7990c1..9e709f201c 100644 --- a/modules/dnn/src/opencl/permute.cl +++ b/modules/dnn/src/opencl/permute.cl @@ -40,7 +40,9 @@ // //M*/ -#define Dtype float +#if defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif __kernel void permute(const int nthreads, __global Dtype* bottom_data, diff --git a/modules/dnn/src/opencl/prior_box.cl b/modules/dnn/src/opencl/prior_box.cl index c51cd43830..6ffbf8df29 100644 --- a/modules/dnn/src/opencl/prior_box.cl +++ b/modules/dnn/src/opencl/prior_box.cl @@ -39,17 +39,18 @@ // //M*/ -#define Dtype float -#define Dtype4 float4 +#if defined(cl_khr_fp16) +#pragma OPENCL 
EXTENSION cl_khr_fp16 : enable +#endif __kernel void prior_box(const int nthreads, - const Dtype stepX, - const Dtype stepY, - __global const Dtype* _offsetsX, - __global const Dtype* _offsetsY, + const float stepX, + const float stepY, + __global const float* _offsetsX, + __global const float* _offsetsY, const int offsetsX_size, - __global const Dtype* _widths, - __global const Dtype* _heights, + __global const float* _widths, + __global const float* _heights, const int widths_size, __global Dtype* dst, const int _layerHeight, @@ -65,7 +66,7 @@ __kernel void prior_box(const int nthreads, outputPtr = dst + index * 4 * offsetsX_size * widths_size; - Dtype _boxWidth, _boxHeight; + float _boxWidth, _boxHeight; Dtype4 vec; for (int i = 0; i < widths_size; ++i) { @@ -73,8 +74,8 @@ __kernel void prior_box(const int nthreads, _boxHeight = _heights[i]; for (int j = 0; j < offsetsX_size; ++j) { - float center_x = (w + _offsetsX[j]) * stepX; - float center_y = (h + _offsetsY[j]) * stepY; + Dtype center_x = (w + _offsetsX[j]) * (Dtype)stepX; + Dtype center_y = (h + _offsetsY[j]) * (Dtype)stepY; vec.x = (center_x - _boxWidth * 0.5f) / imgWidth; // xmin vec.y = (center_y - _boxHeight * 0.5f) / imgHeight; // ymin @@ -91,7 +92,7 @@ __kernel void prior_box(const int nthreads, __kernel void set_variance(const int nthreads, const int offset, const int variance_size, - __global const Dtype* variance, + __global const float* variance, __global Dtype* dst) { for (int index = get_global_id(0); index < nthreads; index += get_global_size(0)) @@ -101,7 +102,7 @@ __kernel void set_variance(const int nthreads, if (variance_size == 1) var_vec = (Dtype4)(variance[0]); else - var_vec = vload4(0, variance); + var_vec = convert_T(vload4(0, variance)); vstore4(var_vec, 0, dst + offset + index * 4); } diff --git a/modules/dnn/src/opencl/reorg.cl b/modules/dnn/src/opencl/reorg.cl index a4b9caea84..62df3cceca 100644 --- a/modules/dnn/src/opencl/reorg.cl +++ b/modules/dnn/src/opencl/reorg.cl @@ -39,6 
+39,10 @@ // //M*/ +#if defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif + __kernel void reorg(const int count, __global const Dtype* src, const int channels, diff --git a/modules/dnn/src/opencl/slice.cl b/modules/dnn/src/opencl/slice.cl index 37ba17c548..5f96a4e4c8 100644 --- a/modules/dnn/src/opencl/slice.cl +++ b/modules/dnn/src/opencl/slice.cl @@ -40,9 +40,9 @@ // //M*/ -#define Dtype float -#define Dtype4 float4 -#define Dtype8 float8 +#if defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif __kernel void slice(__global const Dtype* src, const int src_plane_size, diff --git a/modules/dnn/src/opencl/softmax.cl b/modules/dnn/src/opencl/softmax.cl index 54cf489501..6b525e2ead 100644 --- a/modules/dnn/src/opencl/softmax.cl +++ b/modules/dnn/src/opencl/softmax.cl @@ -24,6 +24,10 @@ * POSSIBILITY OF SUCH DAMAGE. **************************************************************************************/ +#if defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif + __kernel void kernel_channel_max(const int num, const int channels, const int spatial_dim, __global const T* data, __global T* out) { int index = get_global_id(0); @@ -40,12 +44,12 @@ __kernel void kernel_channel_max(const int num, const int channels, __kernel void kernel_channel_subtract(const int count, const int num, const int channels, - const int spatial_dim, __global const T* channel_max, __global T* data) { + const int spatial_dim, __global const T* channel_max, __global const T* src, __global T* data) { int index = get_global_id(0); if(index < count) { int n = index / channels / spatial_dim; int s = index % spatial_dim; - data[index] -= channel_max[n * spatial_dim + s]; + data[index] = exp(src[index] - channel_max[n * spatial_dim + s]); } } diff --git a/modules/dnn/src/opencl/softmax_loss.cl b/modules/dnn/src/opencl/softmax_loss.cl index 28a43ae2d6..8ea52cfa83 100644 --- a/modules/dnn/src/opencl/softmax_loss.cl +++ 
b/modules/dnn/src/opencl/softmax_loss.cl @@ -42,12 +42,15 @@ #define CONCAT(A,B) A##_##B #define TEMPLATE(name,type) CONCAT(name,type) -#define Dtype float #if defined(cl_intel_subgroups) #pragma OPENCL EXTENSION cl_intel_subgroups : enable #endif +#if defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif + __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int channels, const int spatial_dim, __global Dtype* scale, @@ -60,12 +63,12 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann int n = get_global_id(1); for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index += get_global_size(0), ++s) { - float maxval = -FLT_MAX; + Dtype maxval = -DTYPE_MAX; for (int c = get_global_id(0); c < channels; c += get_global_size(0)) { Dtype tmp = data[(n * channels + c) * spatial_dim + s]; maxval = max((Dtype)tmp, (Dtype)maxval); } - maxval = sub_group_reduce_max(maxval * 100000); + maxval = sub_group_reduce_max(maxval); //if (get_sub_group_local_id() == 0) group_tmp[get_sub_group_id() * spatial_dim + s] = maxval; } @@ -77,7 +80,7 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann int s = index / get_max_sub_group_size(); Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]); //if (get_sub_group_local_id() == 0) - scale_tmp[s] = maxval / 100000; + scale_tmp[s] = maxval; } barrier(CLK_LOCAL_MEM_FENCE); @@ -95,7 +98,7 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann for (int c = get_global_id(0); c < channels; c += get_global_size(0)) { sum += out_tmp[c * spatial_dim + s]; } - sum = sub_group_reduce_add(sum * 100000); + sum = sub_group_reduce_add(sum); group_tmp[get_sub_group_id() * spatial_dim + s] = sum; } barrier(CLK_LOCAL_MEM_FENCE); @@ -105,7 +108,7 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann int s = index / 
get_max_sub_group_size(); Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]); //if (get_sub_group_local_id() == 0) - scale_tmp[s] = sum / 100000; + scale_tmp[s] = sum; } barrier(CLK_LOCAL_MEM_FENCE); @@ -130,12 +133,12 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels, __global Dtype *group_tmp = scale + spatial_dim * num + n * get_max_sub_group_size() * spatial_dim; for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index += get_global_size(0), ++s) { - float maxval = -FLT_MAX; + Dtype maxval = -DTYPE_MAX; for (int c = get_global_id(0); c < channels; c += get_global_size(0)) { Dtype tmp = data[(n * channels + c) * spatial_dim + s]; maxval = max((Dtype)tmp, (Dtype)maxval); } - maxval = sub_group_reduce_max(maxval * 100000); + maxval = sub_group_reduce_max(maxval); //if (get_sub_group_local_id() == 0) group_tmp[get_sub_group_id() * spatial_dim + s] = maxval; } @@ -146,7 +149,7 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels, int s = index / get_max_sub_group_size(); Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]); //if (get_sub_group_local_id() == 0) - scale[n * spatial_dim + s] = maxval / 100000; + scale[n * spatial_dim + s] = maxval; } barrier(CLK_GLOBAL_MEM_FENCE); @@ -164,7 +167,7 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels, for (int c = get_global_id(0); c < channels; c += get_global_size(0)) { sum += out[n * channels * spatial_dim + c * spatial_dim + s]; } - sum = sub_group_reduce_add(sum * 100000); + sum = sub_group_reduce_add(sum); group_tmp[get_sub_group_id() * spatial_dim + s] = sum; } barrier(CLK_GLOBAL_MEM_FENCE); @@ -174,7 +177,7 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels, int s = index / get_max_sub_group_size(); Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + 
s]); //if (get_sub_group_local_id() == 0) - scale[n * spatial_dim + s] = sum / 100000; + scale[n * spatial_dim + s] = sum; } barrier(CLK_GLOBAL_MEM_FENCE); diff --git a/modules/dnn/src/precomp.hpp b/modules/dnn/src/precomp.hpp index 356eaff165..f6230c4c6d 100644 --- a/modules/dnn/src/precomp.hpp +++ b/modules/dnn/src/precomp.hpp @@ -64,6 +64,7 @@ namespace cv { namespace dnn { CV__DNN_EXPERIMENTAL_NS_BEGIN +#define IS_DNN_OPENCL_TARGET(id) (id == DNN_TARGET_OPENCL || id == DNN_TARGET_OPENCL_FP16) Mutex& getInitializationMutex(); void initializeLayerFactory(); CV__DNN_EXPERIMENTAL_NS_END diff --git a/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp b/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp index 677f57ab7d..9208588e65 100644 --- a/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp +++ b/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp @@ -538,6 +538,37 @@ public: } }; +// In case of resizing by factor. +class ResizeBilinearSubgraph : public Subgraph +{ +public: + ResizeBilinearSubgraph() + { + int input = addNodeToMatch(""); + + int shape = addNodeToMatch("Shape", input); + int stack = addNodeToMatch("Const"); + int stack_1 = addNodeToMatch("Const"); + int stack_2 = addNodeToMatch("Const"); + int strided_slice = addNodeToMatch("StridedSlice", shape, stack, stack_1, stack_2); + int factorY = addNodeToMatch("Const"); + int mul = addNodeToMatch("Mul", strided_slice, factorY); + + shape = addNodeToMatch("Shape", input); + stack = addNodeToMatch("Const"); + stack_1 = addNodeToMatch("Const"); + stack_2 = addNodeToMatch("Const"); + strided_slice = addNodeToMatch("StridedSlice", shape, stack, stack_1, stack_2); + int factorX = addNodeToMatch("Const"); + int mul_1 = addNodeToMatch("Mul", strided_slice, factorX); + + int pack = addNodeToMatch("Pack", mul, mul_1); + + addNodeToMatch("ResizeBilinear", input, pack); + setFusedNode("ResizeBilinear", input, factorY, factorX); + } +}; + void simplifySubgraphs(tensorflow::GraphDef& net) { std::vector > subgraphs; @@ 
-551,6 +582,7 @@ void simplifySubgraphs(tensorflow::GraphDef& net) subgraphs.push_back(Ptr(new L2NormalizeSubgraph())); subgraphs.push_back(Ptr(new DeconvolutionValidKerasSubgraph())); subgraphs.push_back(Ptr(new DeconvolutionSameKerasSubgraph())); + subgraphs.push_back(Ptr(new ResizeBilinearSubgraph())); int numNodes = net.node_size(); std::vector matchedNodesIds; diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp index 667e573705..efedbceb48 100644 --- a/modules/dnn/src/tensorflow/tf_importer.cpp +++ b/modules/dnn/src/tensorflow/tf_importer.cpp @@ -767,6 +767,26 @@ void TFImporter::populateNet(Net dstNet) } } } + else if (type == "Sub") + { + bool haveConst = false; + for(int ii = 0; !haveConst && ii < layer.input_size(); ++ii) + { + Pin input = parsePin(layer.input(ii)); + haveConst = value_id.find(input.name) != value_id.end(); + } + CV_Assert(haveConst); + + layerParams.blobs.resize(1); + blobFromTensor(getConstBlob(layer, value_id), layerParams.blobs[0]); + layerParams.blobs[0] *= -1; + + int id = dstNet.addLayer(name, "Shift", layerParams); + layer_id[name] = id; + + // one input only + connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0); + } else if (type == "MatMul") { CV_Assert(layer.input_size() == 2); diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp index 2752d7ab2a..2bcd357e2e 100644 --- a/modules/dnn/test/test_backends.cpp +++ b/modules/dnn/test/test_backends.cpp @@ -147,7 +147,9 @@ TEST_P(DNNTestNetwork, Inception_5h) TEST_P(DNNTestNetwork, ENet) { - if (backend == DNN_BACKEND_INFERENCE_ENGINE) throw SkipTestException(""); + if ((backend == DNN_BACKEND_INFERENCE_ENGINE) || + (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16)) + throw SkipTestException(""); processNet("dnn/Enet-model-best.net", "", Size(512, 512), "l367_Deconvolution", target == DNN_TARGET_OPENCL ? 
"dnn/halide_scheduler_opencl_enet.yml" : "dnn/halide_scheduler_enet.yml", @@ -161,9 +163,11 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe) throw SkipTestException(""); Mat sample = imread(findDataFile("dnn/street.png", false)); Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false); + float l1 = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 0.0007 : 0.0; + float lInf = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 0.011 : 0.0; processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt", - inp, "detection_out"); + inp, "detection_out", "", l1, lInf); } TEST_P(DNNTestNetwork, MobileNet_SSD_TensorFlow) @@ -173,15 +177,17 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_TensorFlow) throw SkipTestException(""); Mat sample = imread(findDataFile("dnn/street.png", false)); Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false); + float l1 = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 0.008 : 0.0; + float lInf = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 
0.06 : 0.0; processNet("dnn/ssd_mobilenet_v1_coco.pb", "dnn/ssd_mobilenet_v1_coco.pbtxt", - inp, "detection_out"); + inp, "detection_out", "", l1, lInf); } TEST_P(DNNTestNetwork, SSD_VGG16) { - if (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL || - backend == DNN_BACKEND_HALIDE && target == DNN_TARGET_CPU || - backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU) + if ((backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) || + (backend == DNN_BACKEND_HALIDE && target == DNN_TARGET_CPU) || + (backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU)) throw SkipTestException(""); processNet("dnn/VGG_ILSVRC2016_SSD_300x300_iter_440000.caffemodel", "dnn/ssd_vgg16.prototxt", Size(300, 300), "detection_out"); @@ -236,14 +242,17 @@ TEST_P(DNNTestNetwork, Inception_v2_SSD_TensorFlow) throw SkipTestException(""); Mat sample = imread(findDataFile("dnn/street.png", false)); Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false); + float l1 = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 0.008 : 0.0; + float lInf = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 
0.07 : 0.0; processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "dnn/ssd_inception_v2_coco_2017_11_17.pbtxt", - inp, "detection_out"); + inp, "detection_out", "", l1, lInf); } TEST_P(DNNTestNetwork, DenseNet_121) { - if (backend == DNN_BACKEND_HALIDE || - backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16) + if ((backend == DNN_BACKEND_HALIDE) || + (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) || + (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16)) throw SkipTestException(""); processNet("dnn/DenseNet_121.caffemodel", "dnn/DenseNet_121.prototxt", Size(224, 224), "", "caffe"); } @@ -258,7 +267,8 @@ const tuple testCases[] = { tuple(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL), tuple(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16), #endif - tuple(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL) + tuple(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL), + tuple(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL_FP16) }; INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, testing::ValuesIn(testCases)); diff --git a/modules/dnn/test/test_caffe_importer.cpp b/modules/dnn/test/test_caffe_importer.cpp index 82b395a120..eaf95acc9d 100644 --- a/modules/dnn/test/test_caffe_importer.cpp +++ b/modules/dnn/test/test_caffe_importer.cpp @@ -104,7 +104,11 @@ TEST_P(Reproducibility_AlexNet, Accuracy) ASSERT_FALSE(net.empty()); } - net.setPreferableTarget(get<1>(GetParam())); + int targetId = get<1>(GetParam()); + const float l1 = 1e-5; + const float lInf = (targetId == DNN_TARGET_OPENCL_FP16) ? 
3e-3 : 1e-4; + + net.setPreferableTarget(targetId); Mat sample = imread(_tf("grace_hopper_227.png")); ASSERT_TRUE(!sample.empty()); @@ -112,10 +116,11 @@ TEST_P(Reproducibility_AlexNet, Accuracy) net.setInput(blobFromImage(sample, 1.0f, Size(227, 227), Scalar(), false), "data"); Mat out = net.forward("prob"); Mat ref = blobFromNPY(_tf("caffe_alexnet_prob.npy")); - normAssert(ref, out); + normAssert(ref, out, "", l1, lInf); } -INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_AlexNet, Combine(testing::Bool(), availableDnnTargets())); +INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_AlexNet, Combine(testing::Bool(), + Values(DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16))); #if !defined(_WIN32) || defined(_WIN64) TEST(Reproducibility_FCN, Accuracy) @@ -176,8 +181,11 @@ TEST_P(Reproducibility_MobileNet_SSD, Accuracy) const string proto = findDataFile("dnn/MobileNetSSD_deploy.prototxt", false); const string model = findDataFile("dnn/MobileNetSSD_deploy.caffemodel", false); Net net = readNetFromCaffe(proto, model); + int targetId = GetParam(); + const float l1 = (targetId == DNN_TARGET_OPENCL_FP16) ? 1.5e-4 : 1e-5; + const float lInf = (targetId == DNN_TARGET_OPENCL_FP16) ? 4e-4 : 1e-4; - net.setPreferableTarget(GetParam()); + net.setPreferableTarget(targetId); Mat sample = imread(_tf("street.png")); @@ -185,8 +193,10 @@ TEST_P(Reproducibility_MobileNet_SSD, Accuracy) net.setInput(inp); Mat out = net.forward(); + const float scores_diff = (targetId == DNN_TARGET_OPENCL_FP16) ? 4e-4 : 1e-5; + const float boxes_iou_diff = (targetId == DNN_TARGET_OPENCL_FP16) ? 5e-3 : 1e-4; Mat ref = blobFromNPY(_tf("mobilenet_ssd_caffe_out.npy")); - normAssertDetections(ref, out); + normAssertDetections(ref, out, "", 0.0, scores_diff, boxes_iou_diff); // Check that detections aren't preserved. inp.setTo(0.0f); @@ -212,10 +222,12 @@ TEST_P(Reproducibility_MobileNet_SSD, Accuracy) // a single sample in batch. The first numbers of detection vectors are batch id. 
outBatch = outBatch.reshape(1, outBatch.total() / 7); EXPECT_EQ(outBatch.rows, 2 * numDetections); - normAssert(outBatch.rowRange(0, numDetections), ref); - normAssert(outBatch.rowRange(numDetections, 2 * numDetections).colRange(1, 7), ref.colRange(1, 7)); + normAssert(outBatch.rowRange(0, numDetections), ref, "", l1, lInf); + normAssert(outBatch.rowRange(numDetections, 2 * numDetections).colRange(1, 7), ref.colRange(1, 7), + "", l1, lInf); } -INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_MobileNet_SSD, availableDnnTargets()); +INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_MobileNet_SSD, + Values(DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16)); typedef testing::TestWithParam Reproducibility_ResNet50; TEST_P(Reproducibility_ResNet50, Accuracy) @@ -226,6 +238,9 @@ TEST_P(Reproducibility_ResNet50, Accuracy) int targetId = GetParam(); net.setPreferableTarget(targetId); + float l1 = (targetId == DNN_TARGET_OPENCL_FP16) ? 3e-5 : 1e-5; + float lInf = (targetId == DNN_TARGET_OPENCL_FP16) ? 
6e-3 : 1e-4; + Mat input = blobFromImage(imread(_tf("googlenet_0.png")), 1.0f, Size(224,224), Scalar(), false); ASSERT_TRUE(!input.empty()); @@ -233,20 +248,21 @@ TEST_P(Reproducibility_ResNet50, Accuracy) Mat out = net.forward(); Mat ref = blobFromNPY(_tf("resnet50_prob.npy")); - normAssert(ref, out); + normAssert(ref, out, "", l1, lInf); - if (targetId == DNN_TARGET_OPENCL) + if (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16) { UMat out_umat; net.forward(out_umat); - normAssert(ref, out_umat, "out_umat"); + normAssert(ref, out_umat, "out_umat", l1, lInf); std::vector out_umats; net.forward(out_umats); - normAssert(ref, out_umats[0], "out_umat_vector"); + normAssert(ref, out_umats[0], "out_umat_vector", l1, lInf); } } -INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_ResNet50, availableDnnTargets()); +INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_ResNet50, + Values(DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16)); typedef testing::TestWithParam Reproducibility_SqueezeNet_v1_1; TEST_P(Reproducibility_SqueezeNet_v1_1, Accuracy) diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp index 397aadfa08..88ed53bcd9 100644 --- a/modules/dnn/test/test_tf_importer.cpp +++ b/modules/dnn/test/test_tf_importer.cpp @@ -295,24 +295,30 @@ TEST_P(Test_TensorFlow_nets, opencv_face_detector_uint8) INSTANTIATE_TEST_CASE_P(/**/, Test_TensorFlow_nets, availableDnnTargets()); -TEST(Test_TensorFlow, defun) +typedef testing::TestWithParam Test_TensorFlow_fp16; + +TEST_P(Test_TensorFlow_fp16, tests) { - runTensorFlowNet("defun_dropout"); + int targetId = GetParam(); + const float l1 = 7e-4; + const float lInf = 1e-2; + runTensorFlowNet("fp16_single_conv", targetId, false, l1, lInf); + runTensorFlowNet("fp16_deconvolution", targetId, false, l1, lInf); + runTensorFlowNet("fp16_max_pool_odd_same", targetId, false, l1, lInf); + runTensorFlowNet("fp16_padding_valid", targetId, false, l1, lInf); + 
runTensorFlowNet("fp16_eltwise_add_mul", targetId, false, l1, lInf); + runTensorFlowNet("fp16_max_pool_odd_valid", targetId, false, l1, lInf); + runTensorFlowNet("fp16_pad_and_concat", targetId, false, l1, lInf); + runTensorFlowNet("fp16_max_pool_even", targetId, false, l1, lInf); + runTensorFlowNet("fp16_padding_same", targetId, false, l1, lInf); } -TEST(Test_TensorFlow, fp16) +INSTANTIATE_TEST_CASE_P(/**/, Test_TensorFlow_fp16, + Values(DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16)); + +TEST(Test_TensorFlow, defun) { - const float l1 = 1e-3; - const float lInf = 1e-2; - runTensorFlowNet("fp16_single_conv", DNN_TARGET_CPU, false, l1, lInf); - runTensorFlowNet("fp16_deconvolution", DNN_TARGET_CPU, false, l1, lInf); - runTensorFlowNet("fp16_max_pool_odd_same", DNN_TARGET_CPU, false, l1, lInf); - runTensorFlowNet("fp16_padding_valid", DNN_TARGET_CPU, false, l1, lInf); - runTensorFlowNet("fp16_eltwise_add_mul", DNN_TARGET_CPU, false, l1, lInf); - runTensorFlowNet("fp16_max_pool_odd_valid", DNN_TARGET_CPU, false, l1, lInf); - runTensorFlowNet("fp16_pad_and_concat", DNN_TARGET_CPU, false, l1, lInf); - runTensorFlowNet("fp16_max_pool_even", DNN_TARGET_CPU, false, l1, lInf); - runTensorFlowNet("fp16_padding_same", DNN_TARGET_CPU, false, l1, lInf); + runTensorFlowNet("defun_dropout"); } TEST(Test_TensorFlow, quantized) @@ -373,9 +379,24 @@ public: ResizeBilinearLayer(const LayerParams ¶ms) : Layer(params) { CV_Assert(!params.get("align_corners", false)); - CV_Assert(blobs.size() == 1, blobs[0].type() == CV_32SC1); - outHeight = blobs[0].at(0, 0); - outWidth = blobs[0].at(0, 1); + CV_Assert(!blobs.empty()); + + for (size_t i = 0; i < blobs.size(); ++i) + CV_Assert(blobs[i].type() == CV_32SC1); + + if (blobs.size() == 1) + { + CV_Assert(blobs[0].total() == 2); + outHeight = blobs[0].at(0, 0); + outWidth = blobs[0].at(0, 1); + } + else + { + CV_Assert(blobs.size() == 2, blobs[0].total() == 1, blobs[1].total() == 1); + factorHeight = blobs[0].at(0, 0); + 
factorWidth = blobs[1].at(0, 0); + outHeight = outWidth = 0; + } } static Ptr create(LayerParams& params) @@ -391,12 +412,21 @@ public: std::vector outShape(4); outShape[0] = inputs[0][0]; // batch size outShape[1] = inputs[0][1]; // number of channels - outShape[2] = outHeight; - outShape[3] = outWidth; + outShape[2] = outHeight != 0 ? outHeight : (inputs[0][2] * factorHeight); + outShape[3] = outWidth != 0 ? outWidth : (inputs[0][3] * factorWidth); outputs.assign(1, outShape); return false; } + virtual void finalize(const std::vector& inputs, std::vector &outputs) CV_OVERRIDE + { + if (!outWidth && !outHeight) + { + outHeight = outputs[0].size[2]; + outWidth = outputs[0].size[3]; + } + } + // This implementation is based on a reference implementation from // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h virtual void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE @@ -447,13 +477,51 @@ private: return x + size[3] * (y + size[2] * (c + size[1] * b)); } - int outWidth, outHeight; + int outWidth, outHeight, factorWidth, factorHeight; }; TEST(Test_TensorFlow, resize_bilinear) { CV_DNN_REGISTER_LAYER_CLASS(ResizeBilinear, ResizeBilinearLayer); runTensorFlowNet("resize_bilinear"); + runTensorFlowNet("resize_bilinear_factor"); + LayerFactory::unregisterLayer("ResizeBilinear"); +} + +// inp = cv.imread('opencv_extra/testdata/cv/ximgproc/sources/08.png') +// inp = inp[:,:,[2, 1, 0]].astype(np.float32).reshape(1, 512, 512, 3) +// outs = sess.run([sess.graph.get_tensor_by_name('feature_fusion/Conv_7/Sigmoid:0'), +// sess.graph.get_tensor_by_name('feature_fusion/concat_3:0')], +// feed_dict={'input_images:0': inp}) +// scores = np.ascontiguousarray(outs[0].transpose(0, 3, 1, 2)) +// geometry = np.ascontiguousarray(outs[1].transpose(0, 3, 1, 2)) +// np.save('east_text_detection.scores.npy', scores) +// np.save('east_text_detection.geometry.npy', geometry) 
+TEST(Test_TensorFlow, EAST_text_detection) +{ + CV_DNN_REGISTER_LAYER_CLASS(ResizeBilinear, ResizeBilinearLayer); + std::string netPath = findDataFile("dnn/frozen_east_text_detection.pb", false); + std::string imgPath = findDataFile("cv/ximgproc/sources/08.png", false); + std::string refScoresPath = findDataFile("dnn/east_text_detection.scores.npy", false); + std::string refGeometryPath = findDataFile("dnn/east_text_detection.geometry.npy", false); + + Net net = readNet(findDataFile("dnn/frozen_east_text_detection.pb", false)); + + Mat img = imread(imgPath); + Mat inp = blobFromImage(img, 1.0, Size(), Scalar(123.68, 116.78, 103.94), true, false); + net.setInput(inp); + + std::vector outs; + std::vector outNames(2); + outNames[0] = "feature_fusion/Conv_7/Sigmoid"; + outNames[1] = "feature_fusion/concat_3"; + net.forward(outs, outNames); + + Mat scores = outs[0]; + Mat geometry = outs[1]; + + normAssert(scores, blobFromNPY(refScoresPath), "scores"); + normAssert(geometry, blobFromNPY(refGeometryPath), "geometry", 5e-5, 1e-3); LayerFactory::unregisterLayer("ResizeBilinear"); } diff --git a/modules/imgproc/src/connectedcomponents.cpp b/modules/imgproc/src/connectedcomponents.cpp index bbc27dca7d..13a2fd3293 100644 --- a/modules/imgproc/src/connectedcomponents.cpp +++ b/modules/imgproc/src/connectedcomponents.cpp @@ -503,7 +503,7 @@ namespace cv{ // +-+-+-+ // |p|q|r| // +-+-+-+ - // |x| + // |x| // +-+ const int w = imgLabels.cols, h = imgLabels.rows; @@ -548,7 +548,7 @@ namespace cv{ // +-+-+-+ // |-|q|-| // +-+-+-+ - // |x| + // |x| // +-+ const int w = imgLabels.cols, h = imgLabels.rows; @@ -2473,9 +2473,9 @@ namespace cv{ // |P -|Q -|R -| // |- -|- -|- -| // +---+---+---+ - // |X -| - // |- -| - // +---+ + // |X -| + // |- -| + // +---+ const int w = imgLabels.cols, h = imgLabels.rows; for (int r = chunksSizeAndLabels[0]; r < h; r = chunksSizeAndLabels[r]){ diff --git a/modules/imgproc/src/intersection.cpp b/modules/imgproc/src/intersection.cpp index 
5da743a0f9..3e4a266b30 100644 --- a/modules/imgproc/src/intersection.cpp +++ b/modules/imgproc/src/intersection.cpp @@ -219,13 +219,15 @@ int rotatedRectangleIntersection( const RotatedRect& rect1, const RotatedRect& r } } - // Get rid of dupes + // Get rid of dupes and order points. for( int i = 0; i < (int)intersection.size()-1; i++ ) { + float dx1 = intersection[i + 1].x - intersection[i].x; + float dy1 = intersection[i + 1].y - intersection[i].y; for( size_t j = i+1; j < intersection.size(); j++ ) { - float dx = intersection[i].x - intersection[j].x; - float dy = intersection[i].y - intersection[j].y; + float dx = intersection[j].x - intersection[i].x; + float dy = intersection[j].y - intersection[i].y; double d2 = dx*dx + dy*dy; // can be a really small number, need double here if( d2 < samePointEps*samePointEps ) @@ -235,6 +237,12 @@ int rotatedRectangleIntersection( const RotatedRect& rect1, const RotatedRect& r intersection.pop_back(); j--; // restart check } + else if (dx1 * dy - dy1 * dx < 0) + { + std::swap(intersection[i + 1], intersection[j]); + dx1 = dx; + dy1 = dy; + } } } diff --git a/modules/imgproc/test/test_intersection.cpp b/modules/imgproc/test/test_intersection.cpp index e2d8f18e33..8d770d06ea 100644 --- a/modules/imgproc/test/test_intersection.cpp +++ b/modules/imgproc/test/test_intersection.cpp @@ -66,8 +66,27 @@ private: void test7(); void test8(); void test9(); + void test10(); + void test11(); + void test12(); + void test13(); + void test14(); }; +static void compare(const std::vector& test, const std::vector& target) +{ + ASSERT_EQ(test.size(), target.size()); + ASSERT_TRUE(test.size() < 4 || isContourConvex(test)); + ASSERT_TRUE(target.size() < 4 || isContourConvex(target)); + for( size_t i = 0; i < test.size(); i++ ) + { + double dx = test[i].x - target[i].x; + double dy = test[i].y - target[i].y; + double r = sqrt(dx*dx + dy*dy); + ASSERT_LT(r, ACCURACY); + } +} + void CV_RotatedRectangleIntersectionTest::run(int) { // See 
pics/intersection.png for the scenarios we are testing @@ -92,28 +111,20 @@ void CV_RotatedRectangleIntersectionTest::run(int) test7(); test8(); test9(); + test10(); + test11(); + test12(); + test13(); + test14(); } void CV_RotatedRectangleIntersectionTest::test1() { // no intersection - - RotatedRect rect1, rect2; - - rect1.center.x = 0; - rect1.center.y = 0; - rect1.size.width = 2; - rect1.size.height = 2; - rect1.angle = 12.0f; - - rect2.center.x = 10; - rect2.center.y = 10; - rect2.size.width = 2; - rect2.size.height = 2; - rect2.angle = 34.0f; + RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 12.0f); + RotatedRect rect2(Point2f(10, 10), Size2f(2, 2), 34.0f); vector vertices; - int ret = rotatedRectangleIntersection(rect1, rect2, vertices); CV_Assert(ret == INTERSECT_NONE); @@ -123,375 +134,243 @@ void CV_RotatedRectangleIntersectionTest::test1() void CV_RotatedRectangleIntersectionTest::test2() { // partial intersection, rectangles translated - - RotatedRect rect1, rect2; - - rect1.center.x = 0; - rect1.center.y = 0; - rect1.size.width = 2; - rect1.size.height = 2; - rect1.angle = 0; - - rect2.center.x = 1; - rect2.center.y = 1; - rect2.size.width = 2; - rect2.size.height = 2; - rect2.angle = 0; + RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f); + RotatedRect rect2(Point2f(1, 1), Size2f(2, 2), 0.0f); vector vertices; - int ret = rotatedRectangleIntersection(rect1, rect2, vertices); CV_Assert(ret == INTERSECT_PARTIAL); - CV_Assert(vertices.size() == 4); - - vector possibleVertices(4); - possibleVertices[0] = Point2f(0.0f, 0.0f); - possibleVertices[1] = Point2f(1.0f, 1.0f); - possibleVertices[2] = Point2f(0.0f, 1.0f); - possibleVertices[3] = Point2f(1.0f, 0.0f); - - for( size_t i = 0; i < vertices.size(); i++ ) - { - double bestR = DBL_MAX; - - for( size_t j = 0; j < possibleVertices.size(); j++ ) - { - double dx = vertices[i].x - possibleVertices[j].x; - double dy = vertices[i].y - possibleVertices[j].y; - double r = sqrt(dx*dx + dy*dy); - - bestR = 
std::min(bestR, r); - } - - CV_Assert(bestR < ACCURACY); - } + vector targetVertices(4); + targetVertices[0] = Point2f(1.0f, 0.0f); + targetVertices[1] = Point2f(1.0f, 1.0f); + targetVertices[2] = Point2f(0.0f, 1.0f); + targetVertices[3] = Point2f(0.0f, 0.0f); + compare(vertices, targetVertices); } void CV_RotatedRectangleIntersectionTest::test3() { // partial intersection, rectangles rotated 45 degree on the corner, forms a triangle intersection - RotatedRect rect1, rect2; - - rect1.center.x = 0; - rect1.center.y = 0; - rect1.size.width = 2; - rect1.size.height = 2; - rect1.angle = 0; - - rect2.center.x = 1; - rect2.center.y = 1; - rect2.size.width = sqrt(2.0f); - rect2.size.height = 20; - rect2.angle = 45.0f; + RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f); + RotatedRect rect2(Point2f(1, 1), Size2f(sqrt(2.0f), 20), 45.0f); vector vertices; - int ret = rotatedRectangleIntersection(rect1, rect2, vertices); CV_Assert(ret == INTERSECT_PARTIAL); - CV_Assert(vertices.size() == 3); - - vector possibleVertices(3); - - possibleVertices[0] = Point2f(1.0f, 1.0f); - possibleVertices[1] = Point2f(0.0f, 1.0f); - possibleVertices[2] = Point2f(1.0f, 0.0f); - - for( size_t i = 0; i < vertices.size(); i++ ) - { - double bestR = DBL_MAX; - - for( size_t j = 0; j < possibleVertices.size(); j++ ) - { - double dx = vertices[i].x - possibleVertices[j].x; - double dy = vertices[i].y - possibleVertices[j].y; - double r = sqrt(dx*dx + dy*dy); - bestR = std::min(bestR, r); - } - - CV_Assert(bestR < ACCURACY); - } + vector targetVertices(3); + targetVertices[0] = Point2f(1.0f, 0.0f); + targetVertices[1] = Point2f(1.0f, 1.0f); + targetVertices[2] = Point2f(0.0f, 1.0f); + compare(vertices, targetVertices); } void CV_RotatedRectangleIntersectionTest::test4() { // full intersection, rectangles of same size directly on top of each other - - RotatedRect rect1, rect2; - - rect1.center.x = 0; - rect1.center.y = 0; - rect1.size.width = 2; - rect1.size.height = 2; - rect1.angle = 0; - - 
rect2.center.x = 0; - rect2.center.y = 0; - rect2.size.width = 2; - rect2.size.height = 2; - rect2.angle = 0; + RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f); + RotatedRect rect2(Point2f(0, 0), Size2f(2, 2), 0.0f); vector vertices; - int ret = rotatedRectangleIntersection(rect1, rect2, vertices); CV_Assert(ret == INTERSECT_FULL); - CV_Assert(vertices.size() == 4); - - vector possibleVertices(4); - - possibleVertices[0] = Point2f(-1.0f, 1.0f); - possibleVertices[1] = Point2f(1.0f, -1.0f); - possibleVertices[2] = Point2f(-1.0f, -1.0f); - possibleVertices[3] = Point2f(1.0f, 1.0f); - - for( size_t i = 0; i < vertices.size(); i++ ) - { - double bestR = DBL_MAX; - - for( size_t j = 0; j < possibleVertices.size(); j++ ) - { - double dx = vertices[i].x - possibleVertices[j].x; - double dy = vertices[i].y - possibleVertices[j].y; - double r = sqrt(dx*dx + dy*dy); - - bestR = std::min(bestR, r); - } - CV_Assert(bestR < ACCURACY); - } + vector targetVertices(4); + targetVertices[0] = Point2f(-1.0f, 1.0f); + targetVertices[1] = Point2f(-1.0f, -1.0f); + targetVertices[2] = Point2f(1.0f, -1.0f); + targetVertices[3] = Point2f(1.0f, 1.0f); + compare(vertices, targetVertices); } void CV_RotatedRectangleIntersectionTest::test5() { // partial intersection, rectangle on top rotated 45 degrees - - RotatedRect rect1, rect2; - - rect1.center.x = 0; - rect1.center.y = 0; - rect1.size.width = 2; - rect1.size.height = 2; - rect1.angle = 0; - - rect2.center.x = 0; - rect2.center.y = 0; - rect2.size.width = 2; - rect2.size.height = 2; - rect2.angle = 45.0f; + RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f); + RotatedRect rect2(Point2f(0, 0), Size2f(2, 2), 45.0f); vector vertices; - int ret = rotatedRectangleIntersection(rect1, rect2, vertices); CV_Assert(ret == INTERSECT_PARTIAL); - CV_Assert(vertices.size() == 8); - - vector possibleVertices(8); - possibleVertices[0] = Point2f(-1.0f, -0.414214f); - possibleVertices[1] = Point2f(-1.0f, 0.414214f); - possibleVertices[2] = 
Point2f(-0.414214f, -1.0f); - possibleVertices[3] = Point2f(0.414214f, -1.0f); - possibleVertices[4] = Point2f(1.0f, -0.414214f); - possibleVertices[5] = Point2f(1.0f, 0.414214f); - possibleVertices[6] = Point2f(0.414214f, 1.0f); - possibleVertices[7] = Point2f(-0.414214f, 1.0f); - - for( size_t i = 0; i < vertices.size(); i++ ) - { - double bestR = DBL_MAX; - - for( size_t j = 0; j < possibleVertices.size(); j++ ) - { - double dx = vertices[i].x - possibleVertices[j].x; - double dy = vertices[i].y - possibleVertices[j].y; - double r = sqrt(dx*dx + dy*dy); - - bestR = std::min(bestR, r); - } - - CV_Assert(bestR < ACCURACY); - } + vector targetVertices(8); + targetVertices[0] = Point2f(-1.0f, -0.414214f); + targetVertices[1] = Point2f(-0.414214f, -1.0f); + targetVertices[2] = Point2f(0.414214f, -1.0f); + targetVertices[3] = Point2f(1.0f, -0.414214f); + targetVertices[4] = Point2f(1.0f, 0.414214f); + targetVertices[5] = Point2f(0.414214f, 1.0f); + targetVertices[6] = Point2f(-0.414214f, 1.0f); + targetVertices[7] = Point2f(-1.0f, 0.414214f); + compare(vertices, targetVertices); } void CV_RotatedRectangleIntersectionTest::test6() { // 6 - partial intersection, rectangle on top of different size - - RotatedRect rect1, rect2; - - rect1.center.x = 0; - rect1.center.y = 0; - rect1.size.width = 2; - rect1.size.height = 2; - rect1.angle = 0; - - rect2.center.x = 0; - rect2.center.y = 0; - rect2.size.width = 2; - rect2.size.height = 10; - rect2.angle = 0; + RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f); + RotatedRect rect2(Point2f(0, 0), Size2f(2, 10), 0.0f); vector vertices; - int ret = rotatedRectangleIntersection(rect1, rect2, vertices); CV_Assert(ret == INTERSECT_PARTIAL); - CV_Assert(vertices.size() == 4); - - vector possibleVertices(4); - possibleVertices[0] = Point2f(1.0f, 1.0f); - possibleVertices[1] = Point2f(1.0f, -1.0f); - possibleVertices[2] = Point2f(-1.0f, -1.0f); - possibleVertices[3] = Point2f(-1.0f, 1.0f); - - for( size_t i = 0; i < vertices.size(); 
i++ ) - { - double bestR = DBL_MAX; - - for( size_t j = 0; j < possibleVertices.size(); j++ ) - { - double dx = vertices[i].x - possibleVertices[j].x; - double dy = vertices[i].y - possibleVertices[j].y; - double r = sqrt(dx*dx + dy*dy); - - bestR = std::min(bestR, r); - } - - CV_Assert(bestR < ACCURACY); - } + vector targetVertices(4); + targetVertices[0] = Point2f(-1.0f, -1.0f); + targetVertices[1] = Point2f(1.0f, -1.0f); + targetVertices[2] = Point2f(1.0f, 1.0f); + targetVertices[3] = Point2f(-1.0f, 1.0f); + compare(vertices, targetVertices); } void CV_RotatedRectangleIntersectionTest::test7() { // full intersection, rectangle fully enclosed in the other - - RotatedRect rect1, rect2; - - rect1.center.x = 0; - rect1.center.y = 0; - rect1.size.width = 12.34f; - rect1.size.height = 56.78f; - rect1.angle = 0; - - rect2.center.x = 0; - rect2.center.y = 0; - rect2.size.width = 2; - rect2.size.height = 2; - rect2.angle = 0; + RotatedRect rect1(Point2f(0, 0), Size2f(12.34f, 56.78f), 0.0f); + RotatedRect rect2(Point2f(0, 0), Size2f(2, 2), 0.0f); vector vertices; - int ret = rotatedRectangleIntersection(rect1, rect2, vertices); CV_Assert(ret == INTERSECT_FULL); - CV_Assert(vertices.size() == 4); - - vector possibleVertices(4); - possibleVertices[0] = Point2f(1.0f, 1.0f); - possibleVertices[1] = Point2f(1.0f, -1.0f); - possibleVertices[2] = Point2f(-1.0f, -1.0f); - possibleVertices[3] = Point2f(-1.0f, 1.0f); - - for( size_t i = 0; i < vertices.size(); i++ ) - { - double bestR = DBL_MAX; + vector targetVertices(4); + targetVertices[0] = Point2f(-1.0f, 1.0f); + targetVertices[1] = Point2f(-1.0f, -1.0f); + targetVertices[2] = Point2f(1.0f, -1.0f); + targetVertices[3] = Point2f(1.0f, 1.0f); + compare(vertices, targetVertices); +} - for( size_t j = 0; j < possibleVertices.size(); j++ ) - { - double dx = vertices[i].x - possibleVertices[j].x; - double dy = vertices[i].y - possibleVertices[j].y; - double r = sqrt(dx*dx + dy*dy); +void CV_RotatedRectangleIntersectionTest::test8() 
+{ + // intersection by a single vertex + RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f); + RotatedRect rect2(Point2f(2, 2), Size2f(2, 2), 0.0f); - bestR = std::min(bestR, r); - } + vector vertices; + int ret = rotatedRectangleIntersection(rect1, rect2, vertices); - CV_Assert(bestR < ACCURACY); - } + CV_Assert(ret == INTERSECT_PARTIAL); + compare(vertices, vector(1, Point2f(1.0f, 1.0f))); } -void CV_RotatedRectangleIntersectionTest::test8() +void CV_RotatedRectangleIntersectionTest::test9() { // full intersection, rectangle fully enclosed in the other + RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f); + RotatedRect rect2(Point2f(2, 0), Size2f(2, 123.45f), 0.0f); - RotatedRect rect1, rect2; + vector vertices; + int ret = rotatedRectangleIntersection(rect1, rect2, vertices); + + CV_Assert(ret == INTERSECT_PARTIAL); - rect1.center.x = 0; - rect1.center.y = 0; - rect1.size.width = 2; - rect1.size.height = 2; - rect1.angle = 0; + vector targetVertices(2); + targetVertices[0] = Point2f(1.0f, -1.0f); + targetVertices[1] = Point2f(1.0f, 1.0f); + compare(vertices, targetVertices); +} - rect2.center.x = 2; - rect2.center.y = 2; - rect2.size.width = 2; - rect2.size.height = 2; - rect2.angle = 0; +void CV_RotatedRectangleIntersectionTest::test10() +{ + // three points of rect2 are inside rect1. 
+ RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f); + RotatedRect rect2(Point2f(0, 0.5), Size2f(1, 1), 45.0f); vector vertices; - int ret = rotatedRectangleIntersection(rect1, rect2, vertices); CV_Assert(ret == INTERSECT_PARTIAL); - CV_Assert(vertices.size() == 1); - - double dx = vertices[0].x - 1; - double dy = vertices[0].y - 1; - double r = sqrt(dx*dx + dy*dy); - CV_Assert(r < ACCURACY); + vector targetVertices(5); + targetVertices[0] = Point2f(0.207107f, 1.0f); + targetVertices[1] = Point2f(-0.207107f, 1.0f); + targetVertices[2] = Point2f(-0.707107f, 0.5f); + targetVertices[3] = Point2f(0.0f, -0.207107f); + targetVertices[4] = Point2f(0.707107f, 0.5f); + compare(vertices, targetVertices); } -void CV_RotatedRectangleIntersectionTest::test9() +void CV_RotatedRectangleIntersectionTest::test11() { - // full intersection, rectangle fully enclosed in the other + RotatedRect rect1(Point2f(0, 0), Size2f(4, 2), 0.0f); + RotatedRect rect2(Point2f(0, 0), Size2f(2, 2), -45.0f); - RotatedRect rect1, rect2; + vector vertices; + int ret = rotatedRectangleIntersection(rect1, rect2, vertices); - rect1.center.x = 0; - rect1.center.y = 0; - rect1.size.width = 2; - rect1.size.height = 2; - rect1.angle = 0; + CV_Assert(ret == INTERSECT_PARTIAL); - rect2.center.x = 2; - rect2.center.y = 0; - rect2.size.width = 2; - rect2.size.height = 123.45f; - rect2.angle = 0; + vector targetVertices(6); + targetVertices[0] = Point2f(-0.414214f, -1.0f); + targetVertices[1] = Point2f(0.414213f, -1.0f); + targetVertices[2] = Point2f(1.41421f, 0.0f); + targetVertices[3] = Point2f(0.414214f, 1.0f); + targetVertices[4] = Point2f(-0.414213f, 1.0f); + targetVertices[5] = Point2f(-1.41421f, 0.0f); + compare(vertices, targetVertices); +} - vector vertices; +void CV_RotatedRectangleIntersectionTest::test12() +{ + RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f); + RotatedRect rect2(Point2f(0, 1), Size2f(1, 1), 0.0f); + vector vertices; int ret = rotatedRectangleIntersection(rect1, rect2, vertices); 
CV_Assert(ret == INTERSECT_PARTIAL); - CV_Assert(vertices.size() == 2); - vector possibleVertices(2); + vector targetVertices(4); + targetVertices[0] = Point2f(-0.5f, 1.0f); + targetVertices[1] = Point2f(-0.5f, 0.5f); + targetVertices[2] = Point2f(0.5f, 0.5f); + targetVertices[3] = Point2f(0.5f, 1.0f); + compare(vertices, targetVertices); +} - possibleVertices[0] = Point2f(1.0f, 1.0f); - possibleVertices[1] = Point2f(1.0f, -1.0f); +void CV_RotatedRectangleIntersectionTest::test13() +{ + RotatedRect rect1(Point2f(0, 0), Size2f(1, 3), 0.0f); + RotatedRect rect2(Point2f(0, 1), Size2f(3, 1), 0.0f); - for( size_t i = 0; i < vertices.size(); i++ ) - { - double bestR = DBL_MAX; + vector vertices; + int ret = rotatedRectangleIntersection(rect1, rect2, vertices); - for( size_t j = 0; j < possibleVertices.size(); j++ ) - { - double dx = vertices[i].x - possibleVertices[j].x; - double dy = vertices[i].y - possibleVertices[j].y; - double r = sqrt(dx*dx + dy*dy); + CV_Assert(ret == INTERSECT_PARTIAL); - bestR = std::min(bestR, r); - } + vector targetVertices(4); + targetVertices[0] = Point2f(-0.5f, 0.5f); + targetVertices[1] = Point2f(0.5f, 0.5f); + targetVertices[2] = Point2f(0.5f, 1.5f); + targetVertices[3] = Point2f(-0.5f, 1.5f); + compare(vertices, targetVertices); +} - CV_Assert(bestR < ACCURACY); +void CV_RotatedRectangleIntersectionTest::test14() +{ + const int kNumTests = 100; + const int kWidth = 5; + const int kHeight = 5; + RotatedRect rects[2]; + std::vector inter; + for (int i = 0; i < kNumTests; ++i) + { + for (int j = 0; j < 2; ++j) + { + rects[j].center = Point2f((float)(rand() % kWidth), (float)(rand() % kHeight)); + rects[j].size = Size2f(rand() % kWidth + 1.0f, rand() % kHeight + 1.0f); + rects[j].angle = (float)(rand() % 360); + } + rotatedRectangleIntersection(rects[0], rects[1], inter); + ASSERT_TRUE(inter.size() < 4 || isContourConvex(inter)); } } diff --git a/modules/imgproc/test/test_thresh.cpp b/modules/imgproc/test/test_thresh.cpp index 
833d9e7f87..e9bed8c72e 100644 --- a/modules/imgproc/test/test_thresh.cpp +++ b/modules/imgproc/test/test_thresh.cpp @@ -420,4 +420,18 @@ void CV_ThreshTest::prepare_to_validation( int /*test_case_idx*/ ) TEST(Imgproc_Threshold, accuracy) { CV_ThreshTest test; test.safe_run(); } +BIGDATA_TEST(Imgproc_Threshold, huge) +{ + Mat m(65000, 40000, CV_8U); + ASSERT_FALSE(m.isContinuous()); + + uint64 i, n = (uint64)m.rows*m.cols; + for( i = 0; i < n; i++ ) + m.data[i] = (uchar)(i & 255); + + cv::threshold(m, m, 127, 255, cv::THRESH_BINARY); + int nz = cv::countNonZero(m); // FIXIT 'int' is not enough here (overflow is possible with other inputs) + ASSERT_EQ((uint64)nz, n / 2); +} + }} // namespace diff --git a/modules/photo/src/seamless_cloning_impl.cpp b/modules/photo/src/seamless_cloning_impl.cpp index 6073a9bc4c..2d710cc61e 100644 --- a/modules/photo/src/seamless_cloning_impl.cpp +++ b/modules/photo/src/seamless_cloning_impl.cpp @@ -251,13 +251,15 @@ void Cloning::initVariables(const Mat &destination, const Mat &binaryMask) //init of the filters used in the dst const int w = destination.cols; filter_X.resize(w - 2); + double scale = CV_PI / (w - 1); for(int i = 0 ; i < w-2 ; ++i) - filter_X[i] = 2.0f * std::cos(static_cast(CV_PI) * (i + 1) / (w - 1)); + filter_X[i] = 2.0f * (float)std::cos(scale * (i + 1)); const int h = destination.rows; filter_Y.resize(h - 2); + scale = CV_PI / (h - 1); for(int j = 0 ; j < h - 2 ; ++j) - filter_Y[j] = 2.0f * std::cos(static_cast(CV_PI) * (j + 1) / (h - 1)); + filter_Y[j] = 2.0f * (float)std::cos(scale * (j + 1)); } void Cloning::computeDerivatives(const Mat& destination, const Mat &patch, const Mat &binaryMask) diff --git a/modules/photo/test/test_cloning.cpp b/modules/photo/test/test_cloning.cpp index f83960cd63..34642d4120 100644 --- a/modules/photo/test/test_cloning.cpp +++ b/modules/photo/test/test_cloning.cpp @@ -53,7 +53,7 @@ namespace opencv_test { namespace { #define SAVE(x) #endif -static const double numerical_precision = 
1000.; +static const double numerical_precision = 0.05; // 95% of pixels should have exact values TEST(Photo_SeamlessClone_normal, regression) { @@ -82,8 +82,10 @@ TEST(Photo_SeamlessClone_normal, regression) SAVE(result); - double error = cvtest::norm(reference, result, NORM_L1); - EXPECT_LE(error, numerical_precision); + double errorINF = cvtest::norm(reference, result, NORM_INF); + EXPECT_LE(errorINF, 1); + double errorL1 = cvtest::norm(reference, result, NORM_L1); + EXPECT_LE(errorL1, reference.total() * numerical_precision) << "size=" << reference.size(); } TEST(Photo_SeamlessClone_mixed, regression) @@ -113,9 +115,10 @@ TEST(Photo_SeamlessClone_mixed, regression) Mat reference = imread(reference_path); ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path; - double error = cvtest::norm(reference, result, NORM_L1); - EXPECT_LE(error, numerical_precision); - + double errorINF = cvtest::norm(reference, result, NORM_INF); + EXPECT_LE(errorINF, 1); + double errorL1 = cvtest::norm(reference, result, NORM_L1); + EXPECT_LE(errorL1, reference.total() * numerical_precision) << "size=" << reference.size(); } TEST(Photo_SeamlessClone_featureExchange, regression) @@ -145,9 +148,10 @@ TEST(Photo_SeamlessClone_featureExchange, regression) Mat reference = imread(reference_path); ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path; - double error = cvtest::norm(reference, result, NORM_L1); - EXPECT_LE(error, numerical_precision); - + double errorINF = cvtest::norm(reference, result, NORM_INF); + EXPECT_LE(errorINF, 1); + double errorL1 = cvtest::norm(reference, result, NORM_L1); + EXPECT_LE(errorL1, reference.total() * numerical_precision) << "size=" << reference.size(); } TEST(Photo_SeamlessClone_colorChange, regression) @@ -171,9 +175,10 @@ TEST(Photo_SeamlessClone_colorChange, regression) Mat reference = imread(reference_path); ASSERT_FALSE(reference.empty()) << "Could not load reference image " << 
reference_path; - double error = cvtest::norm(reference, result, NORM_L1); - EXPECT_LE(error, numerical_precision); - + double errorINF = cvtest::norm(reference, result, NORM_INF); + EXPECT_LE(errorINF, 1); + double errorL1 = cvtest::norm(reference, result, NORM_L1); + EXPECT_LE(errorL1, reference.total() * numerical_precision) << "size=" << reference.size(); } TEST(Photo_SeamlessClone_illuminationChange, regression) @@ -195,9 +200,12 @@ TEST(Photo_SeamlessClone_illuminationChange, regression) SAVE(result); Mat reference = imread(reference_path); - double error = cvtest::norm(reference, result, NORM_L1); - EXPECT_LE(error, numerical_precision); + ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path; + double errorINF = cvtest::norm(reference, result, NORM_INF); + EXPECT_LE(errorINF, 1); + double errorL1 = cvtest::norm(reference, result, NORM_L1); + EXPECT_LE(errorL1, reference.total() * numerical_precision) << "size=" << reference.size(); } TEST(Photo_SeamlessClone_textureFlattening, regression) @@ -221,9 +229,10 @@ TEST(Photo_SeamlessClone_textureFlattening, regression) Mat reference = imread(reference_path); ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path; - double error = cvtest::norm(reference, result, NORM_L1); - EXPECT_LE(error, numerical_precision); - + double errorINF = cvtest::norm(reference, result, NORM_INF); + EXPECT_LE(errorINF, 1); + double errorL1 = cvtest::norm(reference, result, NORM_L1); + EXPECT_LE(errorL1, reference.total() * numerical_precision) << "size=" << reference.size(); } }} // namespace diff --git a/modules/stitching/src/blenders.cpp b/modules/stitching/src/blenders.cpp index 33097120b4..64bea1f5ea 100644 --- a/modules/stitching/src/blenders.cpp +++ b/modules/stitching/src/blenders.cpp @@ -661,7 +661,7 @@ void MultiBandBlender::blend(InputOutputArray dst, InputOutputArray dst_mask) } // Set destination Mats to 0 so new image can be blended - for (size_t i = 0; i < 
num_bands_ + 1; ++i) + for (size_t i = 0; i < (size_t)(num_bands_ + 1); ++i) { gpu_dst_band_weights_[i].setTo(0); gpu_dst_pyr_laplace_[i].setTo(Scalar::all(0)); diff --git a/modules/ts/include/opencv2/ts/ts_ext.hpp b/modules/ts/include/opencv2/ts/ts_ext.hpp index 11ee1c9f20..37c399515a 100644 --- a/modules/ts/include/opencv2/ts/ts_ext.hpp +++ b/modules/ts/include/opencv2/ts/ts_ext.hpp @@ -11,6 +11,7 @@ namespace cvtest { void checkIppStatus(); extern bool skipUnstableTests; +extern bool runBigDataTests; extern int testThreads; } @@ -43,7 +44,7 @@ extern int testThreads; #undef TEST -#define TEST(test_case_name, test_name) \ +#define TEST_(test_case_name, test_name, BODY_IMPL) \ class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public ::testing::Test {\ public:\ GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}\ @@ -65,9 +66,37 @@ extern int testThreads; ::testing::Test::TearDownTestCase, \ new ::testing::internal::TestFactoryImpl<\ GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>);\ - void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() CV__TEST_BODY_IMPL( #test_case_name "_" #test_name ) \ + void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() BODY_IMPL( #test_case_name "_" #test_name ) \ void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::Body() +#define TEST(test_case_name, test_name) TEST_(test_case_name, test_name, CV__TEST_BODY_IMPL) + +#define CV__TEST_BIGDATA_BODY_IMPL(name) \ + { \ + if (!cvtest::runBigDataTests) \ + { \ + printf("[ SKIP ] BigData tests are disabled\n"); \ + return; \ + } \ + CV__TRACE_APP_FUNCTION_NAME(name); \ + try { \ + CV__TEST_INIT \ + Body(); \ + CV__TEST_CLEANUP \ + } \ + catch (cvtest::SkipTestException& e) \ + { \ + printf("[ SKIP ] %s\n", e.what()); \ + } \ + } \ + +// Special type of tests which require / use or validate processing of huge amount of data (>= 2Gb) +#if defined(_M_X64) || defined(__x86_64__) || defined(__aarch64__) +#define BIGDATA_TEST(test_case_name, test_name) 
TEST_(BigData_ ## test_case_name, test_name, CV__TEST_BIGDATA_BODY_IMPL) +#else +#define BIGDATA_TEST(test_case_name, test_name) TEST_(BigData_ ## test_case_name, DISABLED_ ## test_name, CV__TEST_BIGDATA_BODY_IMPL) +#endif + #undef TEST_F #define TEST_F(test_fixture, test_name)\ class GTEST_TEST_CLASS_NAME_(test_fixture, test_name) : public test_fixture {\ diff --git a/modules/ts/src/ts.cpp b/modules/ts/src/ts.cpp index 7dd05fbf55..ee823a54a9 100644 --- a/modules/ts/src/ts.cpp +++ b/modules/ts/src/ts.cpp @@ -699,6 +699,7 @@ void checkIppStatus() } bool skipUnstableTests = false; +bool runBigDataTests = false; int testThreads = 0; void parseCustomOptions(int argc, char **argv) @@ -708,6 +709,7 @@ void parseCustomOptions(int argc, char **argv) "{ test_seed |809564 |seed for random numbers generator }" "{ test_threads |-1 |the number of worker threads, if parallel execution is enabled}" "{ skip_unstable |false |skip unstable tests }" + "{ test_bigdata |false |run BigData tests (>=2Gb) }" "{ h help |false |print help info }"; cv::CommandLineParser parser(argc, argv, command_line_keys); @@ -730,6 +732,7 @@ void parseCustomOptions(int argc, char **argv) testThreads = parser.get("test_threads"); skipUnstableTests = parser.get("skip_unstable"); + runBigDataTests = parser.get("test_bigdata"); } diff --git a/modules/videoio/src/cap.cpp b/modules/videoio/src/cap.cpp index 67e1befca2..3c354c23d1 100644 --- a/modules/videoio/src/cap.cpp +++ b/modules/videoio/src/cap.cpp @@ -297,12 +297,6 @@ CV_IMPL CvCapture * cvCreateFileCaptureWithPreference (const char * filename, in // bail out to let the user know that it is not available if (apiPreference) break; -#ifdef HAVE_FFMPEG - case CAP_FFMPEG: - TRY_OPEN(result, cvCreateFileCapture_FFMPEG_proxy (filename)) - if (apiPreference) break; -#endif - #if defined HAVE_LIBV4L || defined HAVE_CAMV4L || defined HAVE_CAMV4L2 || defined HAVE_VIDEOIO case CAP_V4L: TRY_OPEN(result, cvCreateCameraCapture_V4L(filename)) @@ -383,11 +377,6 @@ static 
CvVideoWriter* cvCreateVideoWriterWithPreference(const char* filename, in default: //exit if the specified API is unavaliable if (apiPreference != CAP_ANY) break; - #ifdef HAVE_FFMPEG - case CAP_FFMPEG: - TRY_OPEN(result, cvCreateVideoWriter_FFMPEG_proxy (filename, fourcc, fps, frameSize, is_color)) - if (apiPreference != CAP_ANY) break; - #endif #ifdef HAVE_MSMF case CAP_MSMF: TRY_OPEN(result, cvCreateVideoWriter_MSMF(filename, fourcc, fps, frameSize, is_color)) @@ -530,6 +519,14 @@ static Ptr IVideoCapture_create(const String& filename, int apiPr { bool useAny = (apiPreference == CAP_ANY); Ptr capture; +#ifdef HAVE_FFMPEG + if (useAny || apiPreference == CAP_FFMPEG) + { + capture = cvCreateFileCapture_FFMPEG_proxy(filename); + if (capture && capture->isOpened()) + return capture; + } +#endif #ifdef HAVE_GSTREAMER if (useAny || apiPreference == CAP_GSTREAMER) { @@ -576,6 +573,14 @@ static Ptr IVideoCapture_create(const String& filename, int apiPr static Ptr IVideoWriter_create(const String& filename, int apiPreference, int _fourcc, double fps, Size frameSize, bool isColor) { Ptr iwriter; +#ifdef HAVE_FFMPEG + if (apiPreference == CAP_FFMPEG || apiPreference == CAP_ANY) + { + iwriter = cvCreateVideoWriter_FFMPEG_proxy(filename, _fourcc, fps, frameSize, isColor); + if (!iwriter.empty()) + return iwriter; + } +#endif #ifdef HAVE_MFX if (apiPreference == CAP_INTEL_MFX || apiPreference == CAP_ANY) { diff --git a/modules/videoio/src/cap_ffmpeg.cpp b/modules/videoio/src/cap_ffmpeg.cpp index 5232ab1206..25f7aa60b5 100644 --- a/modules/videoio/src/cap_ffmpeg.cpp +++ b/modules/videoio/src/cap_ffmpeg.cpp @@ -196,11 +196,11 @@ private: }; -class CvCapture_FFMPEG_proxy CV_FINAL : - public CvCapture +class CvCapture_FFMPEG_proxy CV_FINAL : public cv::IVideoCapture { public: CvCapture_FFMPEG_proxy() { ffmpegCapture = 0; } + CvCapture_FFMPEG_proxy(const cv::String& filename) { ffmpegCapture = 0; open(filename); } virtual ~CvCapture_FFMPEG_proxy() { close(); } virtual double 
getProperty(int propId) const CV_OVERRIDE @@ -215,26 +215,25 @@ public: { return ffmpegCapture ? icvGrabFrame_FFMPEG_p(ffmpegCapture)!=0 : false; } - virtual IplImage* retrieveFrame(int) CV_OVERRIDE + virtual bool retrieveFrame(int, cv::OutputArray frame) CV_OVERRIDE { unsigned char* data = 0; int step=0, width=0, height=0, cn=0; if (!ffmpegCapture || !icvRetrieveFrame_FFMPEG_p(ffmpegCapture, &data, &step, &width, &height, &cn)) - return 0; - cvInitImageHeader(&frame, cvSize(width, height), 8, cn); - cvSetData(&frame, data, step); - return &frame; + return false; + cv::Mat(height, width, CV_MAKETYPE(CV_8U, cn), data, step).copyTo(frame); + return true; } - virtual bool open( const char* filename ) + virtual bool open( const cv::String& filename ) { icvInitFFMPEG::Init(); close(); if( !icvCreateFileCapture_FFMPEG_p ) return false; - ffmpegCapture = icvCreateFileCapture_FFMPEG_p( filename ); + ffmpegCapture = icvCreateFileCapture_FFMPEG_p( filename.c_str() ); return ffmpegCapture != 0; } virtual void close() @@ -245,44 +244,45 @@ public: ffmpegCapture = 0; } + virtual bool isOpened() const CV_OVERRIDE { return ffmpegCapture != 0; } + virtual int getCaptureDomain() CV_OVERRIDE { return CV_CAP_FFMPEG; } + protected: void* ffmpegCapture; - IplImage frame; }; -CvCapture* cvCreateFileCapture_FFMPEG_proxy(const char * filename) +cv::Ptr cv::cvCreateFileCapture_FFMPEG_proxy(const cv::String& filename) { - CvCapture_FFMPEG_proxy* result = new CvCapture_FFMPEG_proxy; - if( result->open( filename )) - return result; - delete result; - return 0; + cv::Ptr capture = cv::makePtr(filename); + if (capture && capture->isOpened()) + return capture; + return cv::Ptr(); } class CvVideoWriter_FFMPEG_proxy CV_FINAL : - public CvVideoWriter + public cv::IVideoWriter { public: CvVideoWriter_FFMPEG_proxy() { ffmpegWriter = 0; } + CvVideoWriter_FFMPEG_proxy(const cv::String& filename, int fourcc, double fps, cv::Size frameSize, bool isColor) { ffmpegWriter = 0; open(filename, fourcc, fps, 
frameSize, isColor); } virtual ~CvVideoWriter_FFMPEG_proxy() { close(); } - virtual bool writeFrame( const IplImage* image ) CV_OVERRIDE + virtual void write(cv::InputArray image ) CV_OVERRIDE { if(!ffmpegWriter) - return false; - CV_Assert(image->depth == 8); + return; + CV_Assert(image.depth() == CV_8U); - return icvWriteFrame_FFMPEG_p(ffmpegWriter, (const uchar*)image->imageData, - image->widthStep, image->width, image->height, image->nChannels, image->origin) !=0; + icvWriteFrame_FFMPEG_p(ffmpegWriter, (const uchar*)image.getMat().ptr(), (int)image.step(), image.cols(), image.rows(), image.channels(), 0); } - virtual bool open( const char* filename, int fourcc, double fps, CvSize frameSize, bool isColor ) + virtual bool open( const cv::String& filename, int fourcc, double fps, cv::Size frameSize, bool isColor ) { icvInitFFMPEG::Init(); close(); if( !icvCreateVideoWriter_FFMPEG_p ) return false; - ffmpegWriter = icvCreateVideoWriter_FFMPEG_p( filename, fourcc, fps, frameSize.width, frameSize.height, isColor ); + ffmpegWriter = icvCreateVideoWriter_FFMPEG_p( filename.c_str(), fourcc, fps, frameSize.width, frameSize.height, isColor ); return ffmpegWriter != 0; } @@ -294,18 +294,20 @@ public: ffmpegWriter = 0; } + virtual double getProperty(int) const CV_OVERRIDE { return 0; } + virtual bool setProperty(int, double) CV_OVERRIDE { return false; } + virtual bool isOpened() const CV_OVERRIDE { return ffmpegWriter != 0; } + protected: void* ffmpegWriter; }; -CvVideoWriter* cvCreateVideoWriter_FFMPEG_proxy( const char* filename, int fourcc, - double fps, CvSize frameSize, int isColor ) +cv::Ptr cv::cvCreateVideoWriter_FFMPEG_proxy(const cv::String& filename, int fourcc, + double fps, cv::Size frameSize, int isColor) { - CvVideoWriter_FFMPEG_proxy* result = new CvVideoWriter_FFMPEG_proxy; - - if( result->open( filename, fourcc, fps, frameSize, isColor != 0 )) - return result; - delete result; - return 0; + cv::Ptr writer = cv::makePtr(filename, fourcc, fps, frameSize, 
isColor != 0); + if (writer && writer->isOpened()) + return writer; + return cv::Ptr(); } diff --git a/modules/videoio/src/precomp.hpp b/modules/videoio/src/precomp.hpp index 5ecc6c74b7..8e13080ede 100644 --- a/modules/videoio/src/precomp.hpp +++ b/modules/videoio/src/precomp.hpp @@ -139,12 +139,6 @@ CvVideoWriter* cvCreateVideoWriter_Images(const char* filename); #define CV_CAP_GSTREAMER_V4L2 2 #define CV_CAP_GSTREAMER_FILE 3 -CvCapture* cvCreateFileCapture_FFMPEG_proxy(const char* filename); - - -CvVideoWriter* cvCreateVideoWriter_FFMPEG_proxy( const char* filename, int fourcc, - double fps, CvSize frameSize, int is_color ); - CvCapture * cvCreateFileCapture_QT (const char * filename); CvCapture * cvCreateCameraCapture_QT (const int index); @@ -198,6 +192,9 @@ namespace cv Ptr createGStreamerCapture(const String& filename); Ptr createGStreamerCapture(int index); + + Ptr cvCreateFileCapture_FFMPEG_proxy(const String& filename); + Ptr cvCreateVideoWriter_FFMPEG_proxy(const String& filename, int fourcc, double fps, Size frameSize, int isColor); } #endif /* __VIDEOIO_H_ */ diff --git a/samples/cpp/gstreamer_pipeline.cpp b/samples/cpp/gstreamer_pipeline.cpp index 19b6187ca1..0d467754b3 100644 --- a/samples/cpp/gstreamer_pipeline.cpp +++ b/samples/cpp/gstreamer_pipeline.cpp @@ -18,7 +18,6 @@ class GStreamerPipeline "{h help usage ? 
| | print help messages }" "{m mode | | coding mode (supported: encode, decode) }" "{p pipeline |default | pipeline name (supported: 'default', 'gst-basic', 'gst-vaapi', 'gst-libav', 'ffmpeg') }" - "{ct container |mp4 | container name (supported: 'mp4', 'mov', 'avi', 'mkv') }" "{cd codec |h264 | codec name (supported: 'h264', 'h265', 'mpeg2', 'mpeg4', 'mjpeg', 'vp8') }" "{f file path | | path to file }" "{vr resolution |720p | video resolution for encoding (supported: '720p', '1080p', '4k') }" @@ -30,24 +29,29 @@ class GStreamerPipeline if (cmd_parser->has("help")) { cmd_parser->printMessage(); - exit_code = -1; + CV_Error(Error::StsBadArg, "Called help."); } fast_measure = cmd_parser->has("fast"); // fast measure fps fix_fps = cmd_parser->get("fps"); // fixed frame per second pipeline = cmd_parser->get("pipeline"), // gstreamer pipeline type - container = cmd_parser->get("container"), // container type mode = cmd_parser->get("mode"), // coding mode codec = cmd_parser->get("codec"), // codec type file_name = cmd_parser->get("file"), // path to videofile resolution = cmd_parser->get("resolution"); // video resolution + size_t found = file_name.rfind("."); + if (found != string::npos) + { + container = file_name.substr(found + 1); // container type + } + else { CV_Error(Error::StsBadArg, "Can not parse container extension."); } + if (!cmd_parser->check()) { cmd_parser->printErrors(); - exit_code = -1; + CV_Error(Error::StsBadArg, "Failed parse arguments."); } - exit_code = 0; } ~GStreamerPipeline() { delete cmd_parser; } @@ -55,7 +59,6 @@ class GStreamerPipeline // Start pipeline int run() { - if (exit_code < 0) { return exit_code; } if (mode == "decode") { if (createDecodePipeline() < 0) return -1; } else if (mode == "encode") { if (createEncodePipeline() < 0) return -1; } else @@ -423,7 +426,6 @@ class GStreamerPipeline resolution; // video resolution int fix_fps; // fixed frame per second Size fix_size; // fixed frame size - int exit_code; VideoWriter wrt; 
VideoCapture cap; ostringstream stream_pipeline; @@ -432,6 +434,14 @@ class GStreamerPipeline int main(int argc, char *argv[]) { - GStreamerPipeline pipe(argc, argv); - return pipe.run(); + try + { + GStreamerPipeline pipe(argc, argv); + return pipe.run(); + } + catch(const Exception& e) + { + cerr << e.what() << endl; + return 1; + } } diff --git a/samples/cpp/openni_capture.cpp b/samples/cpp/openni_capture.cpp index 0d0a967226..9323626a49 100644 --- a/samples/cpp/openni_capture.cpp +++ b/samples/cpp/openni_capture.cpp @@ -26,7 +26,7 @@ static void help() << endl; } -static void colorizeDisparity( const Mat& gray, Mat& rgb, double maxDisp=-1.f, float S=1.f, float V=1.f ) +static void colorizeDisparity( const Mat& gray, Mat& rgb, double maxDisp=-1.f) { CV_Assert( !gray.empty() ); CV_Assert( gray.type() == CV_8UC1 ); @@ -42,41 +42,9 @@ static void colorizeDisparity( const Mat& gray, Mat& rgb, double maxDisp=-1.f, f if( maxDisp < 1 ) return; - for( int y = 0; y < gray.rows; y++ ) - { - for( int x = 0; x < gray.cols; x++ ) - { - uchar d = gray.at(y,x); - unsigned int H = ((uchar)maxDisp - d) * 240 / (uchar)maxDisp; - - unsigned int hi = (H/60) % 6; - float f = H/60.f - H/60; - float p = V * (1 - S); - float q = V * (1 - f * S); - float t = V * (1 - (1 - f) * S); - - Point3f res; - - if( hi == 0 ) //R = V, G = t, B = p - res = Point3f( p, t, V ); - if( hi == 1 ) // R = q, G = V, B = p - res = Point3f( p, V, q ); - if( hi == 2 ) // R = p, G = V, B = t - res = Point3f( t, V, p ); - if( hi == 3 ) // R = p, G = q, B = V - res = Point3f( V, q, p ); - if( hi == 4 ) // R = t, G = p, B = V - res = Point3f( V, p, t ); - if( hi == 5 ) // R = V, G = p, B = q - res = Point3f( q, p, V ); - - uchar b = (uchar)(std::max(0.f, std::min (res.x, 1.f)) * 255.f); - uchar g = (uchar)(std::max(0.f, std::min (res.y, 1.f)) * 255.f); - uchar r = (uchar)(std::max(0.f, std::min (res.z, 1.f)) * 255.f); - - rgb.at >(y,x) = Point3_(b, g, r); - } - } + Mat tmp; + convertScaleAbs(gray, tmp, 255.f / 
maxDisp); + applyColorMap(tmp, rgb, COLORMAP_JET); } static float getMaxDisparity( VideoCapture& capture ) diff --git a/samples/cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp b/samples/cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp index 51522fa5ca..dce34e7eb4 100644 --- a/samples/cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp +++ b/samples/cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp @@ -6,9 +6,10 @@ #include "opencv2/imgcodecs.hpp" #include "opencv2/highgui.hpp" -#include +#include using namespace cv; +using std::cout; /** Global Variables */ const int alpha_slider_max = 100; @@ -29,11 +30,8 @@ Mat dst; static void on_trackbar( int, void* ) { alpha = (double) alpha_slider/alpha_slider_max ; - beta = ( 1.0 - alpha ); - addWeighted( src1, alpha, src2, beta, 0.0, dst); - imshow( "Linear Blend", dst ); } //![on_trackbar] @@ -50,8 +48,8 @@ int main( void ) src2 = imread("../data/WindowsLogo.jpg"); //![load] - if( src1.empty() ) { printf("Error loading src1 \n"); return -1; } - if( src2.empty() ) { printf("Error loading src2 \n"); return -1; } + if( src1.empty() ) { cout << "Error loading src1 \n"; return -1; } + if( src2.empty() ) { cout << "Error loading src2 \n"; return -1; } /// Initialize values alpha_slider = 0; diff --git a/samples/cpp/tutorial_code/ImgProc/Morphology_1.cpp b/samples/cpp/tutorial_code/ImgProc/Morphology_1.cpp index 14526ae037..4f5ab98a93 100644 --- a/samples/cpp/tutorial_code/ImgProc/Morphology_1.cpp +++ b/samples/cpp/tutorial_code/ImgProc/Morphology_1.cpp @@ -31,7 +31,7 @@ void Dilation( int, void* ); int main( int argc, char** argv ) { /// Load an image - CommandLineParser parser( argc, argv, "{@input | ../data/chicky_512.png | input image}" ); + CommandLineParser parser( argc, argv, "{@input | ../data/LinuxLogo.jpg | input image}" ); src = imread( parser.get( "@input" ), IMREAD_COLOR ); if( src.empty() ) { diff --git a/samples/cpp/tutorial_code/ImgProc/Morphology_2.cpp b/samples/cpp/tutorial_code/ImgProc/Morphology_2.cpp 
index ce71a3b118..3619753162 100644 --- a/samples/cpp/tutorial_code/ImgProc/Morphology_2.cpp +++ b/samples/cpp/tutorial_code/ImgProc/Morphology_2.cpp @@ -33,7 +33,7 @@ void Morphology_Operations( int, void* ); int main( int argc, char** argv ) { //![load] - CommandLineParser parser( argc, argv, "{@input | ../data/baboon.jpg | input image}" ); + CommandLineParser parser( argc, argv, "{@input | ../data/LinuxLogo.jpg | input image}" ); src = imread( parser.get( "@input" ), IMREAD_COLOR ); if (src.empty()) { diff --git a/samples/cpp/tutorial_code/ImgProc/Smoothing/Smoothing.cpp b/samples/cpp/tutorial_code/ImgProc/Smoothing/Smoothing.cpp index d96b52a0c1..34d5733f75 100644 --- a/samples/cpp/tutorial_code/ImgProc/Smoothing/Smoothing.cpp +++ b/samples/cpp/tutorial_code/ImgProc/Smoothing/Smoothing.cpp @@ -36,52 +36,90 @@ int main( int argc, char ** argv ) const char* filename = argc >=2 ? argv[1] : "../data/lena.jpg"; src = imread( filename, IMREAD_COLOR ); - if(src.empty()){ + if(src.empty()) + { printf(" Error opening image\n"); printf(" Usage: ./Smoothing [image_name -- default ../data/lena.jpg] \n"); return -1; } - if( display_caption( "Original Image" ) != 0 ) { return 0; } + if( display_caption( "Original Image" ) != 0 ) + { + return 0; + } dst = src.clone(); - if( display_dst( DELAY_CAPTION ) != 0 ) { return 0; } - + if( display_dst( DELAY_CAPTION ) != 0 ) + { + return 0; + } /// Applying Homogeneous blur - if( display_caption( "Homogeneous Blur" ) != 0 ) { return 0; } + if( display_caption( "Homogeneous Blur" ) != 0 ) + { + return 0; + } //![blur] for ( int i = 1; i < MAX_KERNEL_LENGTH; i = i + 2 ) - { blur( src, dst, Size( i, i ), Point(-1,-1) ); - if( display_dst( DELAY_BLUR ) != 0 ) { return 0; } } + { + blur( src, dst, Size( i, i ), Point(-1,-1) ); + if( display_dst( DELAY_BLUR ) != 0 ) + { + return 0; + } + } //![blur] /// Applying Gaussian blur - if( display_caption( "Gaussian Blur" ) != 0 ) { return 0; } + if( display_caption( "Gaussian Blur" ) != 0 ) + { + 
return 0; + } //![gaussianblur] for ( int i = 1; i < MAX_KERNEL_LENGTH; i = i + 2 ) - { GaussianBlur( src, dst, Size( i, i ), 0, 0 ); - if( display_dst( DELAY_BLUR ) != 0 ) { return 0; } } + { + GaussianBlur( src, dst, Size( i, i ), 0, 0 ); + if( display_dst( DELAY_BLUR ) != 0 ) + { + return 0; + } + } //![gaussianblur] /// Applying Median blur - if( display_caption( "Median Blur" ) != 0 ) { return 0; } + if( display_caption( "Median Blur" ) != 0 ) + { + return 0; + } //![medianblur] for ( int i = 1; i < MAX_KERNEL_LENGTH; i = i + 2 ) - { medianBlur ( src, dst, i ); - if( display_dst( DELAY_BLUR ) != 0 ) { return 0; } } + { + medianBlur ( src, dst, i ); + if( display_dst( DELAY_BLUR ) != 0 ) + { + return 0; + } + } //![medianblur] /// Applying Bilateral Filter - if( display_caption( "Bilateral Blur" ) != 0 ) { return 0; } + if( display_caption( "Bilateral Blur" ) != 0 ) + { + return 0; + } //![bilateralfilter] for ( int i = 1; i < MAX_KERNEL_LENGTH; i = i + 2 ) - { bilateralFilter ( src, dst, i, i*2, i/2 ); - if( display_dst( DELAY_BLUR ) != 0 ) { return 0; } } + { + bilateralFilter ( src, dst, i, i*2, i/2 ); + if( display_dst( DELAY_BLUR ) != 0 ) + { + return 0; + } + } //![bilateralfilter] /// Done diff --git a/samples/cpp/tutorial_code/dnn/custom_layers.cpp b/samples/dnn/custom_layers.hpp similarity index 81% rename from samples/cpp/tutorial_code/dnn/custom_layers.cpp rename to samples/dnn/custom_layers.hpp index 217e53659f..918cc8ae46 100644 --- a/samples/cpp/tutorial_code/dnn/custom_layers.cpp +++ b/samples/dnn/custom_layers.hpp @@ -1,35 +1,8 @@ -#include - -//! [A custom layer interface] -class MyLayer : public cv::dnn::Layer -{ -public: - //! [MyLayer::MyLayer] - MyLayer(const cv::dnn::LayerParams ¶ms); - //! [MyLayer::MyLayer] - - //! [MyLayer::create] - static cv::Ptr create(cv::dnn::LayerParams& params); - //! [MyLayer::create] - - //! 
[MyLayer::getMemoryShapes] - virtual bool getMemoryShapes(const std::vector > &inputs, - const int requiredOutputs, - std::vector > &outputs, - std::vector > &internals) const CV_OVERRIDE; - //! [MyLayer::getMemoryShapes] - - //! [MyLayer::forward] - virtual void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE; - //! [MyLayer::forward] - - //! [MyLayer::finalize] - virtual void finalize(const std::vector &inputs, std::vector &outputs) CV_OVERRIDE; - //! [MyLayer::finalize] +#ifndef __OPENCV_SAMPLES_DNN_CUSTOM_LAYERS__ +#define __OPENCV_SAMPLES_DNN_CUSTOM_LAYERS__ - virtual void forward(cv::InputArrayOfArrays inputs, cv::OutputArrayOfArrays outputs, cv::OutputArrayOfArrays internals) CV_OVERRIDE; -}; -//! [A custom layer interface] +#include +#include // getPlane //! [InterpLayer] class InterpLayer : public cv::dnn::Layer @@ -113,15 +86,33 @@ private: //! [InterpLayer] //! [ResizeBilinearLayer] -class ResizeBilinearLayer : public cv::dnn::Layer +class ResizeBilinearLayer CV_FINAL : public cv::dnn::Layer { public: ResizeBilinearLayer(const cv::dnn::LayerParams ¶ms) : Layer(params) { CV_Assert(!params.get("align_corners", false)); - CV_Assert(blobs.size() == 1, blobs[0].type() == CV_32SC1); - outHeight = blobs[0].at(0, 0); - outWidth = blobs[0].at(0, 1); + CV_Assert(!blobs.empty()); + + for (size_t i = 0; i < blobs.size(); ++i) + CV_Assert(blobs[i].type() == CV_32SC1); + + // There are two cases of input blob: a single blob which contains output + // shape and two blobs with scaling factors. 
+ if (blobs.size() == 1) + { + CV_Assert(blobs[0].total() == 2); + outHeight = blobs[0].at(0, 0); + outWidth = blobs[0].at(0, 1); + factorHeight = factorWidth = 0; + } + else + { + CV_Assert(blobs.size() == 2, blobs[0].total() == 1, blobs[1].total() == 1); + factorHeight = blobs[0].at(0, 0); + factorWidth = blobs[1].at(0, 0); + outHeight = outWidth = 0; + } } static cv::Ptr create(cv::dnn::LayerParams& params) @@ -130,25 +121,32 @@ public: } virtual bool getMemoryShapes(const std::vector > &inputs, - const int requiredOutputs, + const int, std::vector > &outputs, - std::vector > &internals) const CV_OVERRIDE + std::vector > &) const CV_OVERRIDE { - CV_UNUSED(requiredOutputs); CV_UNUSED(internals); std::vector outShape(4); outShape[0] = inputs[0][0]; // batch size outShape[1] = inputs[0][1]; // number of channels - outShape[2] = outHeight; - outShape[3] = outWidth; + outShape[2] = outHeight != 0 ? outHeight : (inputs[0][2] * factorHeight); + outShape[3] = outWidth != 0 ? outWidth : (inputs[0][3] * factorWidth); outputs.assign(1, outShape); return false; } + virtual void finalize(const std::vector&, std::vector &outputs) CV_OVERRIDE + { + if (!outWidth && !outHeight) + { + outHeight = outputs[0].size[2]; + outWidth = outputs[0].size[3]; + } + } + // This implementation is based on a reference implementation from // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h - virtual void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE + virtual void forward(std::vector &inputs, std::vector &outputs, std::vector &) CV_OVERRIDE { - CV_UNUSED(internals); cv::Mat& inp = *inputs[0]; cv::Mat& out = outputs[0]; const float* inpData = (float*)inp.data; @@ -195,19 +193,54 @@ private: return x + size[3] * (y + size[2] * (c + size[1] * b)); } - int outWidth, outHeight; + int outWidth, outHeight, factorWidth, factorHeight; }; //! 
[ResizeBilinearLayer] +// +// The folowing code is used only to generate tutorials documentation. +// + +//! [A custom layer interface] +class MyLayer : public cv::dnn::Layer +{ +public: + //! [MyLayer::MyLayer] + MyLayer(const cv::dnn::LayerParams ¶ms); + //! [MyLayer::MyLayer] + + //! [MyLayer::create] + static cv::Ptr create(cv::dnn::LayerParams& params); + //! [MyLayer::create] + + //! [MyLayer::getMemoryShapes] + virtual bool getMemoryShapes(const std::vector > &inputs, + const int requiredOutputs, + std::vector > &outputs, + std::vector > &internals) const CV_OVERRIDE; + //! [MyLayer::getMemoryShapes] + + //! [MyLayer::forward] + virtual void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE; + //! [MyLayer::forward] + + //! [MyLayer::finalize] + virtual void finalize(const std::vector &inputs, std::vector &outputs) CV_OVERRIDE; + //! [MyLayer::finalize] + + virtual void forward(cv::InputArrayOfArrays inputs, cv::OutputArrayOfArrays outputs, cv::OutputArrayOfArrays internals) CV_OVERRIDE; +}; +//! [A custom layer interface] + //! [Register a custom layer] -#include // CV_DNN_REGISTER_LAYER_CLASS macro +#include // CV_DNN_REGISTER_LAYER_CLASS -int main(int argc, char** argv) +static inline void loadNet() { - CV_DNN_REGISTER_LAYER_CLASS(MyType, MyLayer); + CV_DNN_REGISTER_LAYER_CLASS(Interp, InterpLayer); // ... //! [Register a custom layer] - CV_UNUSED(argc); CV_UNUSED(argv); + //! [Register InterpLayer] CV_DNN_REGISTER_LAYER_CLASS(Interp, InterpLayer); cv::dnn::Net caffeNet = cv::dnn::readNet("/path/to/config.prototxt", "/path/to/weights.caffemodel"); @@ -217,16 +250,8 @@ int main(int argc, char** argv) CV_DNN_REGISTER_LAYER_CLASS(ResizeBilinear, ResizeBilinearLayer); cv::dnn::Net tfNet = cv::dnn::readNet("/path/to/graph.pb"); //! [Register ResizeBilinearLayer] -} -cv::Ptr MyLayer::create(cv::dnn::LayerParams& params) -{ - return cv::Ptr(new MyLayer(params)); + if (false) loadNet(); // To prevent unused function warning. 
} -MyLayer::MyLayer(const cv::dnn::LayerParams&) {} -bool MyLayer::getMemoryShapes(const std::vector >&, const int, - std::vector >&, - std::vector >&) const { return false; } -void MyLayer::forward(std::vector&, std::vector&, std::vector&) {} -void MyLayer::finalize(const std::vector&, std::vector&) {} -void MyLayer::forward(cv::InputArrayOfArrays, cv::OutputArrayOfArrays, cv::OutputArrayOfArrays) {} + +#endif // __OPENCV_SAMPLES_DNN_CUSTOM_LAYERS__ diff --git a/samples/dnn/text_detection.cpp b/samples/dnn/text_detection.cpp new file mode 100644 index 0000000000..5abe6b6884 --- /dev/null +++ b/samples/dnn/text_detection.cpp @@ -0,0 +1,159 @@ +#include +#include +#include + +#include "custom_layers.hpp" + +using namespace cv; +using namespace cv::dnn; + +const char* keys = + "{ help h | | Print help message. }" + "{ input i | | Path to input image or video file. Skip this argument to capture frames from a camera.}" + "{ model m | | Path to a binary .pb file contains trained network.}" + "{ width | 320 | Preprocess input image by resizing to a specific width. It should be multiple by 32. }" + "{ height | 320 | Preprocess input image by resizing to a specific height. It should be multiple by 32. }" + "{ thr | 0.5 | Confidence threshold. }" + "{ nms | 0.4 | Non-maximum suppression threshold. }"; + +void decode(const Mat& scores, const Mat& geometry, float scoreThresh, + std::vector& detections, std::vector& confidences); + +int main(int argc, char** argv) +{ + // Parse command line arguments. 
+ CommandLineParser parser(argc, argv, keys); + parser.about("Use this script to run TensorFlow implementation (https://github.com/argman/EAST) of " + "EAST: An Efficient and Accurate Scene Text Detector (https://arxiv.org/abs/1704.03155v2)"); + if (argc == 1 || parser.has("help")) + { + parser.printMessage(); + return 0; + } + + float confThreshold = parser.get("thr"); + float nmsThreshold = parser.get("nms"); + int inpWidth = parser.get("width"); + int inpHeight = parser.get("height"); + CV_Assert(parser.has("model")); + String model = parser.get("model"); + + // Register a custom layer. + CV_DNN_REGISTER_LAYER_CLASS(ResizeBilinear, ResizeBilinearLayer); + + // Load network. + Net net = readNet(model); + + // Open a video file or an image file or a camera stream. + VideoCapture cap; + if (parser.has("input")) + cap.open(parser.get("input")); + else + cap.open(0); + + static const std::string kWinName = "EAST: An Efficient and Accurate Scene Text Detector"; + namedWindow(kWinName, WINDOW_NORMAL); + + std::vector outs; + std::vector outNames(2); + outNames[0] = "feature_fusion/Conv_7/Sigmoid"; + outNames[1] = "feature_fusion/concat_3"; + + Mat frame, blob; + while (waitKey(1) < 0) + { + cap >> frame; + if (frame.empty()) + { + waitKey(); + break; + } + + blobFromImage(frame, blob, 1.0, Size(inpWidth, inpHeight), Scalar(123.68, 116.78, 103.94), true, false); + net.setInput(blob); + net.forward(outs, outNames); + + Mat scores = outs[0]; + Mat geometry = outs[1]; + + // Decode predicted bounding boxes. + std::vector boxes; + std::vector confidences; + decode(scores, geometry, confThreshold, boxes, confidences); + + // Apply non-maximum suppression procedure. + std::vector indices; + NMSBoxes(boxes, confidences, confThreshold, nmsThreshold, indices); + + // Render detections. 
+ Point2f ratio((float)frame.cols / inpWidth, (float)frame.rows / inpHeight); + for (size_t i = 0; i < indices.size(); ++i) + { + RotatedRect& box = boxes[indices[i]]; + + Point2f vertices[4]; + box.points(vertices); + for (int j = 0; j < 4; ++j) + { + vertices[j].x *= ratio.x; + vertices[j].y *= ratio.y; + } + for (int j = 0; j < 4; ++j) + line(frame, vertices[j], vertices[(j + 1) % 4], Scalar(0, 255, 0), 1); + } + + // Put efficiency information. + std::vector layersTimes; + double freq = getTickFrequency() / 1000; + double t = net.getPerfProfile(layersTimes) / freq; + std::string label = format("Inference time: %.2f ms", t); + putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0)); + + imshow(kWinName, frame); + } + return 0; +} + +void decode(const Mat& scores, const Mat& geometry, float scoreThresh, + std::vector& detections, std::vector& confidences) +{ + detections.clear(); + CV_Assert(scores.dims == 4, geometry.dims == 4, scores.size[0] == 1, + geometry.size[0] == 1, scores.size[1] == 1, geometry.size[1] == 5, + scores.size[2] == geometry.size[2], scores.size[3] == geometry.size[3]); + + const int height = scores.size[2]; + const int width = scores.size[3]; + for (int y = 0; y < height; ++y) + { + const float* scoresData = scores.ptr(0, 0, y); + const float* x0_data = geometry.ptr(0, 0, y); + const float* x1_data = geometry.ptr(0, 1, y); + const float* x2_data = geometry.ptr(0, 2, y); + const float* x3_data = geometry.ptr(0, 3, y); + const float* anglesData = geometry.ptr(0, 4, y); + for (int x = 0; x < width; ++x) + { + float score = scoresData[x]; + if (score < scoreThresh) + continue; + + // Decode a prediction. + // Multiple by 4 because feature maps are 4 time less than input image. 
+ float offsetX = x * 4.0f, offsetY = y * 4.0f; + float angle = anglesData[x]; + float cosA = std::cos(angle); + float sinA = std::sin(angle); + float h = x0_data[x] + x2_data[x]; + float w = x1_data[x] + x3_data[x]; + + Point2f offset(offsetX + cosA * x1_data[x] + sinA * x2_data[x], + offsetY - sinA * x1_data[x] + cosA * x2_data[x]); + Point2f p1 = Point2f(-sinA * h, -cosA * h) + offset; + Point2f p3 = Point2f(-cosA * w, sinA * w) + offset; + RotatedRect r(0.5f * (p1 + p3), Size2f(w, h), -angle * 180.0f / (float)CV_PI); + detections.push_back(r); + confidences.push_back(score); + } + } +} diff --git a/samples/java/tutorial_code/ImgProc/erosion_dilatation/MorphologyDemo1.java b/samples/java/tutorial_code/ImgProc/erosion_dilatation/MorphologyDemo1.java new file mode 100644 index 0000000000..8a52884984 --- /dev/null +++ b/samples/java/tutorial_code/ImgProc/erosion_dilatation/MorphologyDemo1.java @@ -0,0 +1,159 @@ +import java.awt.BorderLayout; +import java.awt.Container; +import java.awt.event.ActionEvent; +import java.awt.event.ActionListener; +import java.awt.image.BufferedImage; +import java.awt.image.DataBufferByte; + +import javax.swing.BoxLayout; +import javax.swing.ImageIcon; +import javax.swing.JComboBox; +import javax.swing.JFrame; +import javax.swing.JLabel; +import javax.swing.JPanel; +import javax.swing.JSlider; +import javax.swing.event.ChangeEvent; +import javax.swing.event.ChangeListener; + +import org.opencv.core.Core; +import org.opencv.core.Mat; +import org.opencv.core.Point; +import org.opencv.core.Size; +import org.opencv.imgcodecs.Imgcodecs; +import org.opencv.imgproc.Imgproc; + +public class MorphologyDemo1 { + private static final String[] ELEMENT_TYPE = { "Rectangle", "Cross", "Ellipse" }; + private static final String[] MORPH_OP = { "Erosion", "Dilatation" }; + private static final int MAX_KERNEL_SIZE = 21; + private Mat matImgSrc; + private Mat matImgDst = new Mat(); + private int elementType = Imgproc.CV_SHAPE_RECT; + private int kernelSize 
= 0; + private boolean doErosion = true; + private JFrame frame; + private JLabel imgLabel; + + public MorphologyDemo1(String[] args) { + String imagePath = args.length > 0 ? args[0] : "../data/LinuxLogo.jpg"; + matImgSrc = Imgcodecs.imread(imagePath); + if (matImgSrc.empty()) { + System.out.println("Empty image: " + imagePath); + System.exit(0); + } + + // Create and set up the window. + frame = new JFrame("Erosion and dilatation demo"); + frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); + // Set up the content pane. + BufferedImage img = toBufferedImage(matImgSrc); + addComponentsToPane(frame.getContentPane(), img); + // Use the content pane's default BorderLayout. No need for + // setLayout(new BorderLayout()); + // Display the window. + frame.pack(); + frame.setVisible(true); + } + + private void addComponentsToPane(Container pane, BufferedImage img) { + if (!(pane.getLayout() instanceof BorderLayout)) { + pane.add(new JLabel("Container doesn't use BorderLayout!")); + return; + } + + JPanel sliderPanel = new JPanel(); + sliderPanel.setLayout(new BoxLayout(sliderPanel, BoxLayout.PAGE_AXIS)); + + JComboBox elementTypeBox = new JComboBox<>(ELEMENT_TYPE); + elementTypeBox.addActionListener(new ActionListener() { + @Override + public void actionPerformed(ActionEvent e) { + @SuppressWarnings("unchecked") + JComboBox cb = (JComboBox)e.getSource(); + if (cb.getSelectedIndex() == 0) { + elementType = Imgproc.CV_SHAPE_RECT; + } else if (cb.getSelectedIndex() == 1) { + elementType = Imgproc.CV_SHAPE_CROSS; + } else if (cb.getSelectedIndex() == 2) { + elementType = Imgproc.CV_SHAPE_ELLIPSE; + } + update(); + } + }); + sliderPanel.add(elementTypeBox); + + sliderPanel.add(new JLabel("Kernel size: 2n + 1")); + JSlider slider = new JSlider(0, MAX_KERNEL_SIZE, 0); + slider.setMajorTickSpacing(5); + slider.setMinorTickSpacing(5); + slider.setPaintTicks(true); + slider.setPaintLabels(true); + slider.addChangeListener(new ChangeListener() { + @Override + public void 
stateChanged(ChangeEvent e) { + JSlider source = (JSlider) e.getSource(); + kernelSize = source.getValue(); + update(); + } + }); + sliderPanel.add(slider); + + JComboBox morphOpBox = new JComboBox<>(MORPH_OP); + morphOpBox.addActionListener(new ActionListener() { + @Override + public void actionPerformed(ActionEvent e) { + @SuppressWarnings("unchecked") + JComboBox cb = (JComboBox)e.getSource(); + doErosion = cb.getSelectedIndex() == 0; + update(); + } + }); + sliderPanel.add(morphOpBox); + + pane.add(sliderPanel, BorderLayout.PAGE_START); + imgLabel = new JLabel(new ImageIcon(img)); + pane.add(imgLabel, BorderLayout.CENTER); + } + + private BufferedImage toBufferedImage(Mat matrix) { + int type = BufferedImage.TYPE_BYTE_GRAY; + if (matrix.channels() > 1) { + type = BufferedImage.TYPE_3BYTE_BGR; + } + int bufferSize = matrix.channels() * matrix.cols() * matrix.rows(); + byte[] buffer = new byte[bufferSize]; + matrix.get(0, 0, buffer); // get all the pixels + BufferedImage image = new BufferedImage(matrix.cols(), matrix.rows(), type); + final byte[] targetPixels = ((DataBufferByte) image.getRaster().getDataBuffer()).getData(); + System.arraycopy(buffer, 0, targetPixels, 0, buffer.length); + return image; + } + + private void update() { + Mat element = Imgproc.getStructuringElement(elementType, new Size(2 * kernelSize + 1, 2 * kernelSize + 1), + new Point(kernelSize, kernelSize)); + + if (doErosion) { + Imgproc.erode(matImgSrc, matImgDst, element); + } else { + Imgproc.dilate(matImgSrc, matImgDst, element); + } + BufferedImage img = toBufferedImage(matImgDst); + imgLabel.setIcon(new ImageIcon(img)); + frame.repaint(); + } + + public static void main(String[] args) { + // Load the native OpenCV library + System.loadLibrary(Core.NATIVE_LIBRARY_NAME); + + // Schedule a job for the event dispatch thread: + // creating and showing this application's GUI. 
+ javax.swing.SwingUtilities.invokeLater(new Runnable() { + @Override + public void run() { + new MorphologyDemo1(args); + } + }); + } +} diff --git a/samples/java/tutorial_code/ImgProc/opening_closing_hats/MorphologyDemo2.java b/samples/java/tutorial_code/ImgProc/opening_closing_hats/MorphologyDemo2.java new file mode 100644 index 0000000000..07d2f6e44d --- /dev/null +++ b/samples/java/tutorial_code/ImgProc/opening_closing_hats/MorphologyDemo2.java @@ -0,0 +1,157 @@ +import java.awt.BorderLayout; +import java.awt.Container; +import java.awt.event.ActionEvent; +import java.awt.event.ActionListener; +import java.awt.image.BufferedImage; +import java.awt.image.DataBufferByte; + +import javax.swing.BoxLayout; +import javax.swing.ImageIcon; +import javax.swing.JComboBox; +import javax.swing.JFrame; +import javax.swing.JLabel; +import javax.swing.JPanel; +import javax.swing.JSlider; +import javax.swing.event.ChangeEvent; +import javax.swing.event.ChangeListener; + +import org.opencv.core.Core; +import org.opencv.core.Mat; +import org.opencv.core.Point; +import org.opencv.core.Size; +import org.opencv.imgcodecs.Imgcodecs; +import org.opencv.imgproc.Imgproc; + +public class MorphologyDemo2 { + private static final String[] MORPH_OP = { "Opening", "Closing", "Gradient", "Top Hat", "Black Hat" }; + private static final int[] MORPH_OP_TYPE = { Imgproc.MORPH_OPEN, Imgproc.MORPH_CLOSE, + Imgproc.MORPH_GRADIENT, Imgproc.MORPH_TOPHAT, Imgproc.MORPH_BLACKHAT }; + private static final String[] ELEMENT_TYPE = { "Rectangle", "Cross", "Ellipse" }; + private static final int MAX_KERNEL_SIZE = 21; + private Mat matImgSrc; + private Mat matImgDst = new Mat(); + private int morphOpType = Imgproc.MORPH_OPEN; + private int elementType = Imgproc.CV_SHAPE_RECT; + private int kernelSize = 0; + private JFrame frame; + private JLabel imgLabel; + + public MorphologyDemo2(String[] args) { + String imagePath = args.length > 0 ? 
args[0] : "../data/LinuxLogo.jpg"; + matImgSrc = Imgcodecs.imread(imagePath); + if (matImgSrc.empty()) { + System.out.println("Empty image: " + imagePath); + System.exit(0); + } + + // Create and set up the window. + frame = new JFrame("Morphology Transformations demo"); + frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); + // Set up the content pane. + BufferedImage img = toBufferedImage(matImgSrc); + addComponentsToPane(frame.getContentPane(), img); + // Use the content pane's default BorderLayout. No need for + // setLayout(new BorderLayout()); + // Display the window. + frame.pack(); + frame.setVisible(true); + } + + private void addComponentsToPane(Container pane, BufferedImage img) { + if (!(pane.getLayout() instanceof BorderLayout)) { + pane.add(new JLabel("Container doesn't use BorderLayout!")); + return; + } + + JPanel sliderPanel = new JPanel(); + sliderPanel.setLayout(new BoxLayout(sliderPanel, BoxLayout.PAGE_AXIS)); + + JComboBox morphOpBox = new JComboBox<>(MORPH_OP); + morphOpBox.addActionListener(new ActionListener() { + @Override + public void actionPerformed(ActionEvent e) { + @SuppressWarnings("unchecked") + JComboBox cb = (JComboBox)e.getSource(); + morphOpType = MORPH_OP_TYPE[cb.getSelectedIndex()]; + update(); + } + }); + sliderPanel.add(morphOpBox); + + JComboBox elementTypeBox = new JComboBox<>(ELEMENT_TYPE); + elementTypeBox.addActionListener(new ActionListener() { + @Override + public void actionPerformed(ActionEvent e) { + @SuppressWarnings("unchecked") + JComboBox cb = (JComboBox)e.getSource(); + if (cb.getSelectedIndex() == 0) { + elementType = Imgproc.CV_SHAPE_RECT; + } else if (cb.getSelectedIndex() == 1) { + elementType = Imgproc.CV_SHAPE_CROSS; + } else if (cb.getSelectedIndex() == 2) { + elementType = Imgproc.CV_SHAPE_ELLIPSE; + } + update(); + } + }); + sliderPanel.add(elementTypeBox); + + sliderPanel.add(new JLabel("Kernel size: 2n + 1")); + JSlider slider = new JSlider(0, MAX_KERNEL_SIZE, 0); + slider.setMajorTickSpacing(5); 
+ slider.setMinorTickSpacing(5); + slider.setPaintTicks(true); + slider.setPaintLabels(true); + slider.addChangeListener(new ChangeListener() { + @Override + public void stateChanged(ChangeEvent e) { + JSlider source = (JSlider) e.getSource(); + kernelSize = source.getValue(); + update(); + } + }); + sliderPanel.add(slider); + + pane.add(sliderPanel, BorderLayout.PAGE_START); + imgLabel = new JLabel(new ImageIcon(img)); + pane.add(imgLabel, BorderLayout.CENTER); + } + + private BufferedImage toBufferedImage(Mat matrix) { + int type = BufferedImage.TYPE_BYTE_GRAY; + if (matrix.channels() > 1) { + type = BufferedImage.TYPE_3BYTE_BGR; + } + int bufferSize = matrix.channels() * matrix.cols() * matrix.rows(); + byte[] buffer = new byte[bufferSize]; + matrix.get(0, 0, buffer); // get all the pixels + BufferedImage image = new BufferedImage(matrix.cols(), matrix.rows(), type); + final byte[] targetPixels = ((DataBufferByte) image.getRaster().getDataBuffer()).getData(); + System.arraycopy(buffer, 0, targetPixels, 0, buffer.length); + return image; + } + + private void update() { + Mat element = Imgproc.getStructuringElement(elementType, new Size(2 * kernelSize + 1, 2 * kernelSize + 1), + new Point(kernelSize, kernelSize)); + + Imgproc.morphologyEx(matImgSrc, matImgDst, morphOpType, element); + BufferedImage img = toBufferedImage(matImgDst); + imgLabel.setIcon(new ImageIcon(img)); + frame.repaint(); + } + + public static void main(String[] args) { + // Load the native OpenCV library + System.loadLibrary(Core.NATIVE_LIBRARY_NAME); + + // Schedule a job for the event dispatch thread: + // creating and showing this application's GUI. 
+ javax.swing.SwingUtilities.invokeLater(new Runnable() { + @Override + public void run() { + new MorphologyDemo2(args); + } + }); + } +} diff --git a/samples/java/tutorial_code/highgui/trackbar/AddingImagesTrackbar.java b/samples/java/tutorial_code/highgui/trackbar/AddingImagesTrackbar.java new file mode 100644 index 0000000000..5eda2198da --- /dev/null +++ b/samples/java/tutorial_code/highgui/trackbar/AddingImagesTrackbar.java @@ -0,0 +1,119 @@ +import java.awt.BorderLayout; +import java.awt.Container; +import java.awt.Image; + +import javax.swing.BoxLayout; +import javax.swing.ImageIcon; +import javax.swing.JFrame; +import javax.swing.JLabel; +import javax.swing.JPanel; +import javax.swing.JSlider; +import javax.swing.event.ChangeEvent; +import javax.swing.event.ChangeListener; + +import org.opencv.core.Core; +import org.opencv.core.Mat; +import org.opencv.highgui.HighGui; +import org.opencv.imgcodecs.Imgcodecs; + +public class AddingImagesTrackbar { + private static final int ALPHA_SLIDER_MAX = 100; + private int alphaVal = 0; + private Mat matImgSrc1; + private Mat matImgSrc2; + private Mat matImgDst = new Mat(); + private JFrame frame; + private JLabel imgLabel; + + public AddingImagesTrackbar(String[] args) { + //! [load] + String imagePath1 = "../data/LinuxLogo.jpg"; + String imagePath2 = "../data/WindowsLogo.jpg"; + if (args.length > 1) { + imagePath1 = args[0]; + imagePath2 = args[1]; + } + matImgSrc1 = Imgcodecs.imread(imagePath1); + matImgSrc2 = Imgcodecs.imread(imagePath2); + //! [load] + if (matImgSrc1.empty()) { + System.out.println("Empty image: " + imagePath1); + System.exit(0); + } + if (matImgSrc2.empty()) { + System.out.println("Empty image: " + imagePath2); + System.exit(0); + } + + //! [window] + // Create and set up the window. + frame = new JFrame("Linear Blend"); + frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); + // Set up the content pane. 
+ Image img = HighGui.toBufferedImage(matImgSrc2); + addComponentsToPane(frame.getContentPane(), img); + // Use the content pane's default BorderLayout. No need for + // setLayout(new BorderLayout()); + // Display the window. + frame.pack(); + frame.setVisible(true); + //! [window] + } + + private void addComponentsToPane(Container pane, Image img) { + if (!(pane.getLayout() instanceof BorderLayout)) { + pane.add(new JLabel("Container doesn't use BorderLayout!")); + return; + } + + JPanel sliderPanel = new JPanel(); + sliderPanel.setLayout(new BoxLayout(sliderPanel, BoxLayout.PAGE_AXIS)); + + //! [create_trackbar] + sliderPanel.add(new JLabel(String.format("Alpha x %d", ALPHA_SLIDER_MAX))); + JSlider slider = new JSlider(0, ALPHA_SLIDER_MAX, 0); + slider.setMajorTickSpacing(20); + slider.setMinorTickSpacing(5); + slider.setPaintTicks(true); + slider.setPaintLabels(true); + //! [create_trackbar] + //! [on_trackbar] + slider.addChangeListener(new ChangeListener() { + @Override + public void stateChanged(ChangeEvent e) { + JSlider source = (JSlider) e.getSource(); + alphaVal = source.getValue(); + update(); + } + }); + //! [on_trackbar] + sliderPanel.add(slider); + + pane.add(sliderPanel, BorderLayout.PAGE_START); + imgLabel = new JLabel(new ImageIcon(img)); + pane.add(imgLabel, BorderLayout.CENTER); + } + + private void update() { + double alpha = alphaVal / (double) ALPHA_SLIDER_MAX; + double beta = 1.0 - alpha; + Core.addWeighted(matImgSrc1, alpha, matImgSrc2, beta, 0, matImgDst); + Image img = HighGui.toBufferedImage(matImgDst); + imgLabel.setIcon(new ImageIcon(img)); + frame.repaint(); + } + + public static void main(String[] args) { + // Load the native OpenCV library + System.loadLibrary(Core.NATIVE_LIBRARY_NAME); + + // Schedule a job for the event dispatch thread: + // creating and showing this application's GUI. 
+ javax.swing.SwingUtilities.invokeLater(new Runnable() { + @Override + public void run() { + new AddingImagesTrackbar(args); + } + }); + } +} diff --git a/samples/python/tutorial_code/highgui/trackbar/AddingImagesTrackbar.py b/samples/python/tutorial_code/highgui/trackbar/AddingImagesTrackbar.py new file mode 100644 index 0000000000..2ccc978a8e --- /dev/null +++ b/samples/python/tutorial_code/highgui/trackbar/AddingImagesTrackbar.py @@ -0,0 +1,48 @@ +from __future__ import print_function +from __future__ import division +import cv2 as cv +import argparse + +alpha_slider_max = 100 +title_window = 'Linear Blend' + +## [on_trackbar] +def on_trackbar(val): + alpha = val / alpha_slider_max + beta = ( 1.0 - alpha ) + dst = cv.addWeighted(src1, alpha, src2, beta, 0.0) + cv.imshow(title_window, dst) +## [on_trackbar] + +parser = argparse.ArgumentParser(description='Code for Adding a Trackbar to our applications tutorial.') +parser.add_argument('--input1', help='Path to the first input image.', default='../data/LinuxLogo.jpg') +parser.add_argument('--input2', help='Path to the second input image.', default='../data/WindowsLogo.jpg') +args = parser.parse_args() + +## [load] +# Read images ( both have to be of the same size and type ) +src1 = cv.imread(args.input1) +src2 = cv.imread(args.input2) +## [load] +if src1 is None: + print('Could not open or find the image: ', args.input1) + exit(0) + +if src2 is None: + print('Could not open or find the image: ', args.input2) + exit(0) + +## [window] +cv.namedWindow(title_window) +## [window] + +## [create_trackbar] +trackbar_name = 'Alpha x %d' % alpha_slider_max +cv.createTrackbar(trackbar_name, title_window , 0, alpha_slider_max, on_trackbar) +## [create_trackbar] + +# Show some stuff +on_trackbar(0) + +# Wait until user press some key +cv.waitKey() diff --git a/samples/python/tutorial_code/imgProc/erosion_dilatation/morphology_1.py b/samples/python/tutorial_code/imgProc/erosion_dilatation/morphology_1.py new file mode 100644 
index 0000000000..cb3af732e8 --- /dev/null +++ b/samples/python/tutorial_code/imgProc/erosion_dilatation/morphology_1.py @@ -0,0 +1,63 @@ +from __future__ import print_function +import cv2 as cv +import numpy as np +import argparse + +erosion_size = 0 +max_elem = 2 +max_kernel_size = 21 +title_trackbar_element_type = 'Element:\n 0: Rect \n 1: Cross \n 2: Ellipse' +title_trackbar_kernel_size = 'Kernel size:\n 2n +1' +title_erosion_window = 'Erosion Demo' +title_dilatation_window = 'Dilation Demo' + +def erosion(val): + erosion_size = cv.getTrackbarPos(title_trackbar_kernel_size, title_erosion_window) + erosion_type = 0 + val_type = cv.getTrackbarPos(title_trackbar_element_type, title_erosion_window) + if val_type == 0: + erosion_type = cv.MORPH_RECT + elif val_type == 1: + erosion_type = cv.MORPH_CROSS + elif val_type == 2: + erosion_type = cv.MORPH_ELLIPSE + + element = cv.getStructuringElement(erosion_type, (2*erosion_size + 1, 2*erosion_size+1), (erosion_size, erosion_size)) + erosion_dst = cv.erode(src, element) + cv.imshow(title_erosion_window, erosion_dst) + +def dilatation(val): + dilatation_size = cv.getTrackbarPos(title_trackbar_kernel_size, title_dilatation_window) + dilatation_type = 0 + val_type = cv.getTrackbarPos(title_trackbar_element_type, title_dilatation_window) + if val_type == 0: + dilatation_type = cv.MORPH_RECT + elif val_type == 1: + dilatation_type = cv.MORPH_CROSS + elif val_type == 2: + dilatation_type = cv.MORPH_ELLIPSE + + element = cv.getStructuringElement(dilatation_type, (2*dilatation_size + 1, 2*dilatation_size+1), (dilatation_size, dilatation_size)) + dilatation_dst = cv.dilate(src, element) + cv.imshow(title_dilatation_window, dilatation_dst) + +parser = argparse.ArgumentParser(description='Code for Eroding and Dilating tutorial.') +parser.add_argument('--input', help='Path to input image.', default='../data/LinuxLogo.jpg') +args = parser.parse_args() + +src = cv.imread(args.input) +if src is None: + print('Could not open or find 
the image: ', args.input) + exit(0) + +cv.namedWindow(title_erosion_window) +cv.createTrackbar(title_trackbar_element_type, title_erosion_window , 0, max_elem, erosion) +cv.createTrackbar(title_trackbar_kernel_size, title_erosion_window , 0, max_kernel_size, erosion) + +cv.namedWindow(title_dilatation_window) +cv.createTrackbar(title_trackbar_element_type, title_dilatation_window , 0, max_elem, dilatation) +cv.createTrackbar(title_trackbar_kernel_size, title_dilatation_window , 0, max_kernel_size, dilatation) + +erosion(0) +dilatation(0) +cv.waitKey() diff --git a/samples/python/tutorial_code/imgProc/opening_closing_hats/morphology_2.py b/samples/python/tutorial_code/imgProc/opening_closing_hats/morphology_2.py new file mode 100644 index 0000000000..5dfdece1b6 --- /dev/null +++ b/samples/python/tutorial_code/imgProc/opening_closing_hats/morphology_2.py @@ -0,0 +1,48 @@ +from __future__ import print_function +import cv2 as cv +import numpy as np +import argparse + +morph_size = 0 +max_operator = 4 +max_elem = 2 +max_kernel_size = 21 +title_trackbar_operator_type = 'Operator:\n 0: Opening - 1: Closing \n 2: Gradient - 3: Top Hat \n 4: Black Hat' +title_trackbar_element_type = 'Element:\n 0: Rect - 1: Cross - 2: Ellipse' +title_trackbar_kernel_size = 'Kernel size:\n 2n + 1' +title_window = 'Morphology Transformations Demo' +morph_op_dic = {0: cv.MORPH_OPEN, 1: cv.MORPH_CLOSE, 2: cv.MORPH_GRADIENT, 3: cv.MORPH_TOPHAT, 4: cv.MORPH_BLACKHAT} + +def morphology_operations(val): + morph_operator = cv.getTrackbarPos(title_trackbar_operator_type, title_window) + morph_size = cv.getTrackbarPos(title_trackbar_kernel_size, title_window) + morph_elem = 0 + val_type = cv.getTrackbarPos(title_trackbar_element_type, title_window) + if val_type == 0: + morph_elem = cv.MORPH_RECT + elif val_type == 1: + morph_elem = cv.MORPH_CROSS + elif val_type == 2: + morph_elem = cv.MORPH_ELLIPSE + + element = cv.getStructuringElement(morph_elem, (2*morph_size + 1, 2*morph_size+1), (morph_size, 
morph_size)) + operation = morph_op_dic[morph_operator] + dst = cv.morphologyEx(src, operation, element) + cv.imshow(title_window, dst) + +parser = argparse.ArgumentParser(description='Code for More Morphology Transformations tutorial.') +parser.add_argument('--input', help='Path to input image.', default='../data/LinuxLogo.jpg') +args = parser.parse_args() + +src = cv.imread(args.input) +if src is None: + print('Could not open or find the image: ', args.input) + exit(0) + +cv.namedWindow(title_window) +cv.createTrackbar(title_trackbar_operator_type, title_window , 0, max_operator, morphology_operations) +cv.createTrackbar(title_trackbar_element_type, title_window , 0, max_elem, morphology_operations) +cv.createTrackbar(title_trackbar_kernel_size, title_window , 0, max_kernel_size, morphology_operations) + +morphology_operations(0) +cv.waitKey()