Merge pull request #707 from ludv1x:dnn

pull/746/head
Vadim Pisarevsky 9 years ago
commit 9a342b5187
  1. 25    modules/dnn/CMakeLists.txt
  2. 97    modules/dnn/cmake/FindAtlas.cmake
  3. 106   modules/dnn/cmake/FindOpenBLAS.cmake
  4. 60    modules/dnn/cmake/OpenCVFindCBLAS.cmake
  5. 123   modules/dnn/cmake/OpenCVFindMKL.cmake
  6. 371   modules/dnn/include/opencv2/dnn/all_layers.hpp
  7. 155   modules/dnn/include/opencv2/dnn/blob.hpp
  8. 281   modules/dnn/include/opencv2/dnn/blob.inl.hpp
  9. 4     modules/dnn/include/opencv2/dnn/dict.hpp
  10. 25   modules/dnn/include/opencv2/dnn/dnn.hpp
  11. 6    modules/dnn/include/opencv2/dnn/dnn.inl.hpp
  12. 17   modules/dnn/include/opencv2/dnn/layer.hpp
  13. 137  modules/dnn/include/opencv2/dnn/shape_utils.hpp
  14. 80   modules/dnn/perf/perf_convolution.cpp
  15. 3    modules/dnn/perf/perf_main.cpp
  16. 17   modules/dnn/perf/perf_precomp.hpp
  17. 1    modules/dnn/samples/.gitignore
  18. 4    modules/dnn/samples/caffe_googlenet.cpp
  19. 432  modules/dnn/src/blob.cpp
  20. 4    modules/dnn/src/caffe/caffe_importer.cpp
  21. 294  modules/dnn/src/caffe/layer_loaders.cpp
  22. 60   modules/dnn/src/caffe/layer_loaders.hpp
  23. 7    modules/dnn/src/dnn.cpp
  24. 51   modules/dnn/src/init.cpp
  25. 96   modules/dnn/src/layers/concat_layer.cpp
  26. 27   modules/dnn/src/layers/concat_layer.hpp
  27. 366  modules/dnn/src/layers/convolution_layer.cpp
  28. 82   modules/dnn/src/layers/convolution_layer.hpp
  29. 46   modules/dnn/src/layers/elementwise_layers.cpp
  30. 314  modules/dnn/src/layers/elementwise_layers.hpp
  31. 115  modules/dnn/src/layers/fully_connected_layer.cpp
  32. 28   modules/dnn/src/layers/fully_connected_layer.hpp
  33. 39   modules/dnn/src/layers/layers_common.cpp
  34. 4    modules/dnn/src/layers/layers_common.hpp
  35. 266  modules/dnn/src/layers/lrn_layer.cpp
  36. 38   modules/dnn/src/layers/lrn_layer.hpp
  37. 32   modules/dnn/src/layers/mvn_layer.cpp
  38. 8    modules/dnn/src/layers/mvn_layer.hpp
  39. 95   modules/dnn/src/layers/op_blas.cpp
  40. 40   modules/dnn/src/layers/op_blas.hpp
  41. 116  modules/dnn/src/layers/op_im2col.cpp
  42. 231  modules/dnn/src/layers/op_im2col.hpp
  43. 301  modules/dnn/src/layers/pooling_layer.cpp
  44. 48   modules/dnn/src/layers/pooling_layer.hpp
  45. 440  modules/dnn/src/layers/recurrent_layers.cpp
  46. 54   modules/dnn/src/layers/recurrent_layers.hpp
  47. 111  modules/dnn/src/layers/reshape_layer.cpp
  48. 15   modules/dnn/src/layers/reshape_layer.hpp
  49. 91   modules/dnn/src/layers/slice_layer.cpp
  50. 16   modules/dnn/src/layers/slice_layer.hpp
  51. 212  modules/dnn/src/layers/softmax_layer.cpp
  52. 30   modules/dnn/src/layers/softmax_layer.hpp
  53. 39   modules/dnn/src/layers/split_layer.cpp
  54. 10   modules/dnn/src/layers/split_layer.hpp
  55. 44   modules/dnn/src/opencl/activations.cl
  56. 62   modules/dnn/src/opencl/col2im.cl
  57. 10   modules/dnn/src/opencl/im2col.cl
  58. 76   modules/dnn/src/opencl/lrn.cl
  59. 94   modules/dnn/src/opencl/pooling.cl
  60. 75   modules/dnn/src/opencl/softmax.cl
  61. 1    modules/dnn/src/precomp.hpp
  62. 28   modules/dnn/src/torch/torch_importer.cpp
  63. 17   modules/dnn/test/test_googlenet.cpp
  64. 267  modules/dnn/test/test_layers.cpp
  65. 28   modules/dnn/test/test_main.cpp
  66. 1    modules/dnn/testdata/dnn/.gitignore

@@ -17,15 +17,38 @@ ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4701)
# Resolve libprotobuf dependency
# ----------------------------------------------------------------------------
include(cmake/OpenCVFindLibProtobuf.cmake)
ocv_glob_module_sources(${PROTOBUF_SRCS} ${PROTOBUF_HDRS})
ocv_source_group("Src\\protobuf" FILES ${PROTOBUF_SRCS} ${PROTOBUF_HDRS})
ocv_module_include_directories(include ${PROTOBUF_INCLUDE_DIR})
# ----------------------------------------------------------------------------
# Try to find BLAS libraries
# ----------------------------------------------------------------------------
OCV_OPTION(${the_module}_WITH_BLAS "Use external BLAS library to speedup processing" OFF)
include(cmake/OpenCVFindCBLAS.cmake)
ocv_glob_module_sources(${PROTOBUF_SRCS} ${PROTOBUF_HDRS} ${CBLAS_H_PROXY_PATH})
ocv_create_module(${PROTOBUF_LIBRARIES})
ocv_add_samples()
ocv_add_accuracy_tests()
ocv_add_perf_tests()
# ----------------------------------------------------------------------------
# Link BLAS
# ----------------------------------------------------------------------------
if(${the_module}_WITH_BLAS AND HAVE_BLAS)
add_definitions(-DHAVE_CBLAS=1)
ocv_module_include_directories(${${the_module}_BLAS_INCLUDE_DIR})
ocv_add_dependencies(${the_module} ${${the_module}_BLAS_LIBRARIES})
target_link_libraries(${the_module} ${${the_module}_BLAS_LIBRARIES})
if(${the_module}_BLAS_BINARIES)
ocv_install_target(${the_module} EXPORT ${the_module}_BLAS_BINARIES
RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT libs)
endif()
else()
add_definitions(-DHAVE_CBLAS=0)
endif()
# ----------------------------------------------------------------------------
# Download pre-trained models for complex testing on GoogLeNet and AlexNet
# ----------------------------------------------------------------------------

@@ -0,0 +1,97 @@
#COPYRIGHT
#
#All contributions by the University of California:
#Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
#All rights reserved.
#
#All other contributions:
#Copyright (c) 2014, 2015, the respective contributors
#All rights reserved.
#
#Caffe uses a shared copyright model: each contributor holds copyright over
#their contributions to Caffe. The project versioning records all such
#contribution and copyright details. If a contributor wants to further mark
#their specific copyright on a particular contribution, they should indicate
#their copyright solely in the commit message of the change when it is
#committed.
#
#LICENSE
#
#Redistribution and use in source and binary forms, with or without
#modification, are permitted provided that the following conditions are met:
#
#1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
#ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
#WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
#DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
#ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
#(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
#LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
#ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#CONTRIBUTION AGREEMENT
#
#By contributing to the BVLC/caffe repository through pull-request, comment,
#or otherwise, the contributor releases their content to the
#license and copyright terms herein.
# Find the Atlas (and Lapack) libraries
#
# The following variables are optionally searched for defaults
# Atlas_ROOT_DIR: Base directory where all Atlas components are found
#
# The following are set after configuration is done:
# Atlas_FOUND
# Atlas_INCLUDE_DIRS
# Atlas_LIBRARIES
# Atlas_LIBRARY_DIRS
set(Atlas_INCLUDE_SEARCH_PATHS
/usr/include/atlas
/usr/include/atlas-base
$ENV{Atlas_ROOT_DIR}
$ENV{Atlas_ROOT_DIR}/include
)
set(Atlas_LIB_SEARCH_PATHS
/usr/lib/atlas
/usr/lib/atlas-base
$ENV{Atlas_ROOT_DIR}
$ENV{Atlas_ROOT_DIR}/lib
)
find_path(Atlas_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS ${Atlas_INCLUDE_SEARCH_PATHS})
find_path(Atlas_CLAPACK_INCLUDE_DIR NAMES clapack.h PATHS ${Atlas_INCLUDE_SEARCH_PATHS})
find_library(Atlas_CBLAS_LIBRARY NAMES ptcblas_r ptcblas cblas_r cblas PATHS ${Atlas_LIB_SEARCH_PATHS})
find_library(Atlas_BLAS_LIBRARY NAMES atlas_r atlas PATHS ${Atlas_LIB_SEARCH_PATHS})
find_library(Atlas_LAPACK_LIBRARY NAMES alapack_r alapack lapack_atlas PATHS ${Atlas_LIB_SEARCH_PATHS})
set(LOOKED_FOR
Atlas_CBLAS_INCLUDE_DIR
Atlas_CLAPACK_INCLUDE_DIR
Atlas_CBLAS_LIBRARY
Atlas_BLAS_LIBRARY
Atlas_LAPACK_LIBRARY
)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Atlas DEFAULT_MSG ${LOOKED_FOR})
if(ATLAS_FOUND)
set(Atlas_INCLUDE_DIR ${Atlas_CBLAS_INCLUDE_DIR} ${Atlas_CLAPACK_INCLUDE_DIR})
set(Atlas_LIBRARIES ${Atlas_LAPACK_LIBRARY} ${Atlas_CBLAS_LIBRARY} ${Atlas_BLAS_LIBRARY})
mark_as_advanced(${LOOKED_FOR})
message(STATUS "Found Atlas (include: ${Atlas_CBLAS_INCLUDE_DIR}, library: ${Atlas_BLAS_LIBRARY})")
endif(ATLAS_FOUND)

@@ -0,0 +1,106 @@
#COPYRIGHT
#
#All contributions by the University of California:
#Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
#All rights reserved.
#
#All other contributions:
#Copyright (c) 2014, 2015, the respective contributors
#All rights reserved.
#
#Caffe uses a shared copyright model: each contributor holds copyright over
#their contributions to Caffe. The project versioning records all such
#contribution and copyright details. If a contributor wants to further mark
#their specific copyright on a particular contribution, they should indicate
#their copyright solely in the commit message of the change when it is
#committed.
#
#LICENSE
#
#Redistribution and use in source and binary forms, with or without
#modification, are permitted provided that the following conditions are met:
#
#1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
#ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
#WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
#DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
#ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
#(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
#LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
#ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#CONTRIBUTION AGREEMENT
#
#By contributing to the BVLC/caffe repository through pull-request, comment,
#or otherwise, the contributor releases their content to the
#license and copyright terms herein.
SET(Open_BLAS_INCLUDE_SEARCH_PATHS
/usr/include
/usr/include/openblas
/usr/include/openblas-base
/usr/local/include
/usr/local/include/openblas
/usr/local/include/openblas-base
/opt/OpenBLAS/include
$ENV{OpenBLAS_HOME}
$ENV{OpenBLAS_HOME}/include
)
SET(Open_BLAS_LIB_SEARCH_PATHS
/lib/
/lib/openblas-base
/lib64/
/usr/lib
/usr/lib/openblas-base
/usr/lib64
/usr/local/lib
/usr/local/lib64
/opt/OpenBLAS/lib
$ENV{OpenBLAS}
$ENV{OpenBLAS}/lib
$ENV{OpenBLAS_HOME}
$ENV{OpenBLAS_HOME}/lib
)
FIND_PATH(OpenBLAS_INCLUDE_DIR NAMES cblas.h PATHS ${Open_BLAS_INCLUDE_SEARCH_PATHS})
FIND_LIBRARY(OpenBLAS_LIB NAMES openblas PATHS ${Open_BLAS_LIB_SEARCH_PATHS})
SET(OpenBLAS_FOUND ON)
# Check include files
IF(NOT OpenBLAS_INCLUDE_DIR)
SET(OpenBLAS_FOUND OFF)
MESSAGE(STATUS "Could not find OpenBLAS include. Turning OpenBLAS_FOUND off")
ENDIF()
# Check libraries
IF(NOT OpenBLAS_LIB)
SET(OpenBLAS_FOUND OFF)
MESSAGE(STATUS "Could not find OpenBLAS lib. Turning OpenBLAS_FOUND off")
ENDIF()
IF (OpenBLAS_FOUND)
IF (NOT OpenBLAS_FIND_QUIETLY)
MESSAGE(STATUS "Found OpenBLAS libraries: ${OpenBLAS_LIB}")
MESSAGE(STATUS "Found OpenBLAS include: ${OpenBLAS_INCLUDE_DIR}")
ENDIF (NOT OpenBLAS_FIND_QUIETLY)
ELSE (OpenBLAS_FOUND)
IF (OpenBLAS_FIND_REQUIRED)
MESSAGE(FATAL_ERROR "Could not find OpenBLAS")
ENDIF (OpenBLAS_FIND_REQUIRED)
ENDIF (OpenBLAS_FOUND)
MARK_AS_ADVANCED(
OpenBLAS_INCLUDE_DIR
OpenBLAS_LIB
OpenBLAS
)

@@ -0,0 +1,60 @@
macro(_find_file_in_dirs VAR NAME DIRS)
find_path(${VAR} ${NAME} ${DIRS} NO_DEFAULT_PATH)
set(${VAR} ${${VAR}}/${NAME})
unset(${VAR} CACHE)
endmacro()
if(${the_module}_WITH_BLAS)
set(_bp ${the_module}_BLAS) #prefix for blas variables
set(BLAS_CBLAS_H "cblas.h")
set(HAVE_BLAS "")
if(NOT HAVE_BLAS) #check custom BLAS from user input
if(${_bp}_INCLUDE_DIR AND ${_bp}_LIBRARIES AND ${_bp}_CBLAS_H)
set(HAVE_BLAS "Custom")
endif()
endif()
if(NOT HAVE_BLAS)
include(cmake/OpenCVFindMKL.cmake)
if(MKL_FOUND)
set(BLAS_INCLUDE_DIR ${MKL_INCLUDE_DIRS})
set(BLAS_LIBRARIES ${MKL_LIBRARIES} )
set(BLAS_CBLAS_H "mkl_cblas.h" )
set(HAVE_BLAS "MKL")
endif()
endif()
if(NOT HAVE_BLAS)
include(cmake/FindOpenBLAS.cmake)
if(OpenBLAS_FOUND)
set(BLAS_INCLUDE_DIR ${OpenBLAS_INCLUDE_DIR} )
set(BLAS_LIBRARIES ${OpenBLAS_LIB} )
set(HAVE_BLAS "OpenBLAS")
endif()
endif()
if(NOT HAVE_BLAS AND UNIX)
include(cmake/FindAtlas.cmake)
if(ATLAS_FOUND)
set(BLAS_INCLUDE_DIR ${Atlas_INCLUDE_DIR})
set(BLAS_LIBRARIES ${Atlas_LIBRARIES} )
set(HAVE_BLAS "Atlas")
endif()
endif()
if(NOT HAVE_BLAS OR NOT (HAVE_BLAS STREQUAL "Custom"))
set(${_bp}_INCLUDE_DIR ${BLAS_INCLUDE_DIR} CACHE PATH "Path to BLAS include dir" FORCE)
set(${_bp}_CBLAS_H ${BLAS_CBLAS_H} CACHE STRING "Alternative name of cblas.h" FORCE)
set(${_bp}_LIBRARIES ${BLAS_LIBRARIES} CACHE FILEPATH "Path to BLAS libraries that will be linked with ${the_module} module" FORCE)
set(${_bp}_BINARIES ${BLAS_BINARIES} CACHE FILEPATH "Path to BLAS binaries (.so, .dll) that will be installed with ${the_module} module" FORCE)
endif()
if(HAVE_BLAS) #adding proxy cblas.h header
_find_file_in_dirs(CBLAS_H_PATH ${${_bp}_CBLAS_H} ${${_bp}_INCLUDE_DIR})
if(NOT CBLAS_H_PATH)
message(WARNING "CBLAS header '${${_bp}_CBLAS_H}' not found in '${${_bp}_INCLUDE_DIR}'")
endif()
set(CBLAS_H_PROXY_PATH ${CMAKE_CURRENT_BINARY_DIR}/opencv_cblas.hpp)
set(_include_str "\#include \"${CBLAS_H_PATH}\"")
file(WRITE ${CBLAS_H_PROXY_PATH} ${_include_str})
endif()
endif()

@@ -0,0 +1,123 @@
#
# The script to detect Intel(R) Math Kernel Library (MKL)
# installation/package
#
# Parameters:
# MKL_WITH_TBB
#
# On return this will define:
#
# HAVE_MKL - True if Intel MKL is found
# MKL_ROOT_DIR - root of MKL installation
# MKL_INCLUDE_DIRS - MKL include folder
# MKL_LIBRARIES - MKL libraries that are used by OpenCV
#
macro(mkl_fail)
set(HAVE_MKL OFF CACHE BOOL "True if MKL found")
set(MKL_ROOT_DIR ${MKL_ROOT_DIR} CACHE PATH "Path to MKL directory")
unset(MKL_INCLUDE_DIRS CACHE)
unset(MKL_LIBRARIES CACHE)
endmacro()
macro(get_mkl_version VERSION_FILE)
# read MKL version info from file
file(STRINGS ${VERSION_FILE} STR1 REGEX "__INTEL_MKL__")
file(STRINGS ${VERSION_FILE} STR2 REGEX "__INTEL_MKL_MINOR__")
file(STRINGS ${VERSION_FILE} STR3 REGEX "__INTEL_MKL_UPDATE__")
#file(STRINGS ${VERSION_FILE} STR4 REGEX "INTEL_MKL_VERSION")
# extract info and assign to variables
string(REGEX MATCHALL "[0-9]+" MKL_VERSION_MAJOR ${STR1})
string(REGEX MATCHALL "[0-9]+" MKL_VERSION_MINOR ${STR2})
string(REGEX MATCHALL "[0-9]+" MKL_VERSION_UPDATE ${STR3})
set(MKL_VERSION_STR "${MKL_VERSION_MAJOR}.${MKL_VERSION_MINOR}.${MKL_VERSION_UPDATE}" CACHE STRING "MKL version" FORCE)
endmacro()
if(NOT DEFINED MKL_USE_MULTITHREAD)
OCV_OPTION(MKL_WITH_TBB "Use MKL with TBB multithreading" OFF)#ON IF WITH_TBB)
OCV_OPTION(MKL_WITH_OPENMP "Use MKL with OpenMP multithreading" OFF)#ON IF WITH_OPENMP)
endif()
#check current MKL_ROOT_DIR
if(NOT MKL_ROOT_DIR OR NOT EXISTS ${MKL_ROOT_DIR}/include/mkl.h)
set(MKLROOT_PATHS ${MKL_ROOT_DIR})
if(DEFINED ENV{MKLROOT})
list(APPEND MKLROOT_PATHS $ENV{MKLROOT})
endif()
if(WIN32)
set(ProgramFilesx86 "ProgramFiles(x86)")
list(APPEND MKLROOT_PATHS $ENV{${ProgramFilesx86}}/IntelSWTools/compilers_and_libraries/windows/mkl)
endif()
if(UNIX)
list(APPEND MKLROOT_PATHS "/opt/intel/mkl")
endif()
find_path(MKL_ROOT_DIR include/mkl.h PATHS ${MKLROOT_PATHS})
endif()
if(NOT MKL_ROOT_DIR)
mkl_fail()
return()
endif()
set(MKL_INCLUDE_DIRS ${MKL_ROOT_DIR}/include)
set(MKL_INCLUDE_HEADERS ${MKL_INCLUDE_DIRS}/mkl.h ${MKL_INCLUDE_DIRS}/mkl_version.h)
#determine arch
if(CMAKE_CXX_SIZEOF_DATA_PTR EQUAL 8)
set(MKL_X64 1)
set(MKL_ARCH "intel64")
include(CheckTypeSize)
CHECK_TYPE_SIZE(int _sizeof_int)
if (_sizeof_int EQUAL 4)
set(MKL_LP64 "lp64")
else()
set(MKL_LP64 "ilp64")
endif()
else()
set(MKL_ARCH "ia32")
endif()
if(MSVC)
set(MKL_EXT ".lib")
set(MKL_PRE "")
else()
set(MKL_EXT ".a")
set(MKL_PRE "lib")
endif()
set(MKL_LIB_DIR ${MKL_ROOT_DIR}/lib/${MKL_ARCH})
set(MKL_LIBRARIES ${MKL_LIB_DIR}/${MKL_PRE}mkl_core${MKL_EXT} ${MKL_LIB_DIR}/${MKL_PRE}mkl_intel_${MKL_LP64}${MKL_EXT})
if(MKL_WITH_TBB)
list(APPEND MKL_LIBRARIES ${MKL_LIB_DIR}/${MKL_PRE}mkl_tbb_thread${MKL_EXT})
list(APPEND MKL_LIBRARIES ${MKL_ROOT_DIR}/../tbb/lib/${MKL_ARCH}/tbb${MKL_EXT})
elseif(MKL_WITH_OPENMP)
message(FATAL_ERROR "Multithreaded MKL is not supported yet")
else()
list(APPEND MKL_LIBRARIES ${MKL_LIB_DIR}/${MKL_PRE}mkl_sequential${MKL_EXT})
endif()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(MKL MKL_INCLUDE_HEADERS MKL_LIBRARIES)
if(MKL_FOUND)
get_mkl_version(${MKL_INCLUDE_DIRS}/mkl_version.h)
message(STATUS "Found MKL ${MKL_VERSION_STR} at: ${MKL_ROOT_DIR}")
set(HAVE_MKL ON CACHE BOOL "True if MKL found")
set(MKL_ROOT_DIR ${MKL_ROOT_DIR} CACHE PATH "Path to MKL directory")
set(MKL_INCLUDE_DIRS ${MKL_INCLUDE_DIRS} CACHE PATH "Path to MKL include directory")
if(NOT UNIX)
set(MKL_LIBRARIES ${MKL_LIBRARIES} CACHE FILEPATH "MKL libraries")
else()
#it's ugly but helps to avoid cyclic lib problem
set(MKL_LIBRARIES ${MKL_LIBRARIES} ${MKL_LIBRARIES} ${MKL_LIBRARIES} "-lpthread" "-lm" "-ldl")
set(MKL_LIBRARIES ${MKL_LIBRARIES} CACHE STRING "MKL libraries")
endif()
else()
endif()

@@ -0,0 +1,371 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_DNN_DNN_ALL_LAYERS_HPP__
#define __OPENCV_DNN_DNN_ALL_LAYERS_HPP__
#include <opencv2/dnn.hpp>
namespace cv
{
namespace dnn
{
//! @addtogroup dnn
//! @{
/** @defgroup dnnLayerList Partial List of Implemented Layers
@{
This subsection of the dnn module contains information about built-in layers and their descriptions.
Classes listed here, in fact, provide C++ API for creating instances of built-in layers.
In addition to this way of layer instantiation, there is a more common factory API (see @ref dnnLayerFactory); it allows creating layers dynamically (by name) and registering new ones.
You can use both APIs, but the factory API is less convenient for native C++ programming and is basically designed for use inside importers (see @ref Importer, @ref createCaffeImporter(), @ref createTorchImporter()).
Built-in layers partially reproduce functionality of corresponding Caffe and Torch7 layers.
In particular, the following layers and Caffe @ref Importer were tested to reproduce <a href="http://caffe.berkeleyvision.org/tutorial/layers.html">Caffe</a> functionality:
- Convolution
- Deconvolution
- Pooling
- InnerProduct
- TanH, ReLU, Sigmoid, BNLL, Power, AbsVal
- Softmax
- Reshape, Flatten, Slice, Split
- LRN
- MVN
- Dropout (since it does nothing on the forward pass)
*/
//! LSTM recurrent layer
class CV_EXPORTS_W LSTMLayer : public Layer
{
public:
/** Creates instance of LSTM layer */
static Ptr<LSTMLayer> create();
/** Set trained weights for LSTM layer.
LSTM behavior on each step is defined by current input, previous output, previous cell state and learned weights.
Let @f$x_t@f$ be current input, @f$h_t@f$ be current output, @f$c_t@f$ be current state.
Then the current output and the current cell state are computed as follows:
@f{eqnarray*}{
h_t &= o_t \odot tanh(c_t), \\
c_t &= f_t \odot c_{t-1} + i_t \odot g_t, \\
@f}
where @f$\odot@f$ is the per-element multiplication operation and @f$i_t, f_t, o_t, g_t@f$ are internal gates that are computed using learned weights.
Gates are computed as follows:
@f{eqnarray*}{
i_t &= sigmoid&(W_{xi} x_t + W_{hi} h_{t-1} + b_i), \\
f_t &= sigmoid&(W_{xf} x_t + W_{hf} h_{t-1} + b_f), \\
o_t &= sigmoid&(W_{xo} x_t + W_{ho} h_{t-1} + b_o), \\
g_t &= tanh &(W_{xg} x_t + W_{hg} h_{t-1} + b_g), \\
@f}
where @f$W_{x?}@f$, @f$W_{h?}@f$ and @f$b_{?}@f$ are learned weights represented as matrices:
@f$W_{x?} \in R^{N_h \times N_x}@f$, @f$W_{h?} \in R^{N_h \times N_h}@f$, @f$b_? \in R^{N_h}@f$.
For simplicity and performance purposes we use @f$ W_x = [W_{xi}; W_{xf}; W_{xo}; W_{xg}] @f$
(i.e. @f$W_x@f$ is vertical concatenation of @f$ W_{x?} @f$), @f$ W_x \in R^{4N_h \times N_x} @f$.
The same for @f$ W_h = [W_{hi}; W_{hf}; W_{ho}; W_{hg}] @f$, @f$ W_h \in R^{4N_h \times N_h} @f$
and for @f$ b = [b_i; b_f; b_o; b_g] @f$, @f$ b \in R^{4N_h} @f$.
@param Wh is the matrix defining how the previous output is transformed to internal gates (i.e. @f$ W_h @f$ in the notation above)
@param Wx is the matrix defining how the current input is transformed to internal gates (i.e. @f$ W_x @f$ in the notation above)
@param b is the bias vector (i.e. @f$ b @f$ in the notation above)
*/
virtual void setWeights(const Blob &Wh, const Blob &Wx, const Blob &b) = 0;
/** @brief Specifies shape of output blob which will be [[`T`], `N`] + @p outTailShape.
* @details If this parameter is empty or unset then @p outTailShape = [`Wh`.size(0)] will be used,
* where `Wh` is parameter from setWeights().
*/
virtual void setOutShape(const BlobShape &outTailShape = BlobShape::empty()) = 0;
/** @brief Set @f$ h_{t-1} @f$ value that will be used in next forward() calls.
* @details By default @f$ h_{t-1} @f$ is initialized with zeros and updated after each forward() call.
*/
virtual void setH(const Blob &H) = 0;
/** @brief Returns current @f$ h_{t-1} @f$ value (deep copy). */
virtual Blob getH() const = 0;
/** @brief Set @f$ c_{t-1} @f$ value that will be used in next forward() calls.
* @details By default @f$ c_{t-1} @f$ is initialized with zeros and updated after each forward() call.
*/
virtual void setC(const Blob &C) = 0;
/** @brief Returns current @f$ c_{t-1} @f$ value (deep copy). */
virtual Blob getC() const = 0;
/** @brief Specifies whether the first dimension of the input blob is interpreted as the timestamp dimension or as the sample dimension.
*
* If the flag is set to true then the shape of the input blob will be interpreted as [`T`, `N`, `[data dims]`] where `T` specifies the number of timestamps and `N` is the number of independent streams.
* In this case each forward() call will iterate through `T` timestamps and update layer's state `T` times.
*
* If the flag is set to false then the shape of the input blob will be interpreted as [`N`, `[data dims]`].
* In this case each forward() call will make one iteration and produce one timestamp with shape [`N`, `[out dims]`].
*/
virtual void setUseTimstampsDim(bool use = true) = 0;
/** @brief If this flag is set to true then layer will produce @f$ c_t @f$ as second output.
* @details Shape of the second output is the same as first output.
*/
virtual void setProduceCellOutput(bool produce = false) = 0;
/** In the common case this layer uses a single input with @f$x_t@f$ values to compute the output(s) @f$h_t@f$ (and @f$c_t@f$).
* @param input should contain packed values @f$x_t@f$
* @param output contains computed outputs: @f$h_t@f$ (and @f$c_t@f$ if setProduceCellOutput() flag was set to true).
*
* If setUseTimstampsDim() is set to true then @p input[0] should have at least two dimensions with the following shape: [`T`, `N`, `[data dims]`],
* where `T` specifies the number of timestamps and `N` is the number of independent streams (i.e. @f$ x_{t_0 + t}^{stream} @f$ is stored inside @p input[0][t, stream, ...]).
*
* If setUseTimstampsDim() is set to false then @p input[0] should contain a single timestamp; its shape should have the form [`N`, `[data dims]`] with at least one dimension
* (i.e. @f$ x_{t}^{stream} @f$ is stored inside @p input[0][stream, ...]).
*/
void forward(std::vector<Blob*> &input, std::vector<Blob> &output);
int inputNameToIndex(String inputName);
int outputNameToIndex(String outputName);
};
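To make the weight packing above concrete, here is a minimal usage sketch based on the declarations in this header (sizes are illustrative; weights are left uninitialized, and the allocate()/forward() calling convention is assumed from the Layer interface):

#include <opencv2/dnn/all_layers.hpp>
void lstmSketch()
{
    using namespace cv;
    using namespace cv::dnn;
    const int Nx = 8, Nh = 16, T = 5, N = 2;
    // Packed weights: Wx is [4*Nh x Nx], Wh is [4*Nh x Nh], b is [4*Nh],
    // with the i, f, o, g gate rows stacked as described above.
    Blob Wx(BlobShape(4 * Nh, Nx)), Wh(BlobShape(4 * Nh, Nh)), b(BlobShape(4 * Nh));
    Ptr<LSTMLayer> lstm = LSTMLayer::create();
    lstm->setWeights(Wh, Wx, b);
    lstm->setUseTimstampsDim(true);      // input will be interpreted as [T, N, Nx]
    Blob x(BlobShape(T, N, Nx));
    std::vector<Blob*> inputs(1, &x);
    std::vector<Blob> outputs;
    lstm->allocate(inputs, outputs);     // assumed Layer::allocate() signature
    lstm->forward(inputs, outputs);      // outputs[0] holds h_t with shape [T, N, Nh]
}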
//! Classical recurrent layer
class CV_EXPORTS_W RNNLayer : public Layer
{
public:
/** Creates instance of RNNLayer */
static Ptr<RNNLayer> create();
/** Setups learned weights.
Recurrent-layer behavior on each step is defined by current input @f$ x_t @f$, previous state @f$ h_t @f$ and learned weights as follows:
@f{eqnarray*}{
h_t &= tanh&(W_{hh} h_{t-1} + W_{xh} x_t + b_h), \\
o_t &= tanh&(W_{ho} h_t + b_o),
@f}
@param Wxh is @f$ W_{xh} @f$ matrix
@param bh is @f$ b_{h} @f$ vector
@param Whh is @f$ W_{hh} @f$ matrix
@param Who is @f$ W_{ho} @f$ matrix
@param bo is @f$ b_{o} @f$ vector
*/
virtual void setWeights(const Blob &Wxh, const Blob &bh, const Blob &Whh, const Blob &Who, const Blob &bo) = 0;
/** @brief If this flag is set to true then layer will produce @f$ h_t @f$ as second output.
* @details Shape of the second output is the same as first output.
*/
virtual void setProduceHiddenOutput(bool produce = false) = 0;
/** Accepts two inputs @f$x_t@f$ and @f$h_{t-1}@f$ and computes two outputs @f$o_t@f$ and @f$h_t@f$.
@param input should contain packed input @f$x_t@f$.
@param output should contain output @f$o_t@f$ (and @f$h_t@f$ if setProduceHiddenOutput() is set to true).
@p input[0] should have shape [`T`, `N`, `data_dims`] where `T` and `N` are the number of timestamps and the number of independent samples of @f$x_t@f$ respectively.
@p output[0] will have shape [`T`, `N`, @f$N_o@f$], where @f$N_o@f$ is the number of rows in the @f$ W_{ho} @f$ matrix.
If setProduceHiddenOutput() is set to true then @p output[1] will contain a Blob with shape [`T`, `N`, @f$N_h@f$], where @f$N_h@f$ is number of rows in @f$ W_{hh} @f$ matrix.
*/
void forward(std::vector<Blob*> &input, std::vector<Blob> &output);
};
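In the same spirit, a hedged sketch of the weight shapes RNNLayer expects (Nx, Nh and No are the input, hidden and output sizes; values illustrative, weights uninitialized):

void rnnSketch()
{
    using namespace cv;
    using namespace cv::dnn;
    const int Nx = 8, Nh = 16, No = 4;
    Ptr<RNNLayer> rnn = RNNLayer::create();
    rnn->setWeights(Blob(BlobShape(Nh, Nx)),   // Wxh
                    Blob(BlobShape(Nh)),       // bh
                    Blob(BlobShape(Nh, Nh)),   // Whh
                    Blob(BlobShape(No, Nh)),   // Who
                    Blob(BlobShape(No)));      // bo
    // input[0] of shape [T, N, Nx] then yields output[0] of shape [T, N, No].
}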
class CV_EXPORTS_W BaseConvolutionLayer : public Layer
{
public:
Size kernel, stride, pad;
};
class CV_EXPORTS_W ConvolutionLayer : public BaseConvolutionLayer
{
public:
static Ptr<BaseConvolutionLayer> create(Size kernel = Size(3, 3), Size stride = Size(1, 1), Size pad = Size(0, 0));
};
class CV_EXPORTS_W DeconvolutionLayer : public BaseConvolutionLayer
{
public:
static Ptr<BaseConvolutionLayer> create(Size kernel = Size(3, 3), Size stride = Size(1, 1), Size pad = Size(0, 0));
};
class CV_EXPORTS_W LRNLayer : public Layer
{
public:
enum Type
{
CHANNEL_NRM,
SPATIAL_NRM
};
int type;
int size;
double alpha, beta;
static Ptr<LRNLayer> create(int type = CHANNEL_NRM, int size = 5, double alpha = 1, double beta = 0.75);
};
class CV_EXPORTS_W PoolingLayer : public Layer
{
public:
enum Type
{
MAX,
AVE,
STOCHASTIC
};
int type;
Size kernel, stride, pad;
static Ptr<PoolingLayer> create(int type = MAX, Size kernel = Size(2, 2), Size stride = Size(1, 1), Size pad = Size(0, 0));
};
class CV_EXPORTS_W SoftmaxLayer : public Layer
{
public:
static Ptr<SoftmaxLayer> create(int axis = 1);
};
class CV_EXPORTS_W InnerProductLayer : public Layer
{
public:
int axis;
static Ptr<InnerProductLayer> create(int axis = 1);
};
class CV_EXPORTS_W MVNLayer : public Layer
{
public:
double eps;
bool normVariance, acrossChannels;
static Ptr<MVNLayer> create(bool normVariance = true, bool acrossChannels = false, double eps = 1e-9);
};
/* Reshaping */
class CV_EXPORTS_W ReshapeLayer : public Layer
{
public:
BlobShape newShapeDesc;
Range newShapeRange;
static Ptr<ReshapeLayer> create(const BlobShape &newShape, Range applyingRange = Range::all());
};
class CV_EXPORTS_W ConcatLayer : public Layer
{
public:
int axis;
static Ptr<ConcatLayer> create(int axis = 1);
};
class CV_EXPORTS_W SplitLayer : public Layer
{
public:
int outputsCount; //!< Number of copies that will be produced (is ignored when negative).
static Ptr<SplitLayer> create(int outputsCount = -1);
};
class CV_EXPORTS_W SliceLayer : public Layer
{
public:
int axis;
std::vector<int> sliceIndices;
static Ptr<SliceLayer> create(int axis);
static Ptr<SliceLayer> create(int axis, const std::vector<int> &sliceIndices);
};
/* Activations */
class CV_EXPORTS_W ReLULayer : public Layer
{
public:
double negativeSlope;
static Ptr<ReLULayer> create(double negativeSlope = 0);
};
class CV_EXPORTS_W TanHLayer : public Layer
{
public:
static Ptr<TanHLayer> create();
};
class CV_EXPORTS_W SigmoidLayer : public Layer
{
public:
static Ptr<SigmoidLayer> create();
};
class CV_EXPORTS_W BNLLLayer : public Layer
{
public:
static Ptr<BNLLLayer> create();
};
class CV_EXPORTS_W AbsLayer : public Layer
{
public:
static Ptr<AbsLayer> create();
};
class CV_EXPORTS_W PowerLayer : public Layer
{
public:
double power, scale, shift;
static Ptr<PowerLayer> create(double power = 1, double scale = 1, double shift = 0);
};
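These activation classes mirror their Caffe counterparts; in particular, PowerLayer is assumed to compute @f$ f(x) = (shift + scale \cdot x)^{power} @f$, following the Caffe convention. A one-line sketch:

Ptr<PowerLayer> sq = PowerLayer::create(/*power*/ 2.0, /*scale*/ 1.0, /*shift*/ 0.0); // f(x) = x^2 under the assumed formula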
//! @}
//! @}
}
}
#endif

@@ -44,6 +44,7 @@
#include <opencv2/core.hpp>
#include <vector>
#include <ostream>
#include <iostream>
namespace cv
{
@@ -55,13 +56,21 @@ namespace dnn
/** @brief Lightweight class for storing and processing a shape of blob (or anything else). */
struct BlobShape
{
explicit BlobShape(int ndims = 4, int fill = 1); //!< Creates n-dim shape and fill its by @p fill
BlobShape(); //!< Creates [1, 1, 1, 1] shape. @todo Make the behavior clearer.
explicit BlobShape(int s0); //!< Creates 1-dim shape [@p s0]
BlobShape(int s0, int s1); //!< @overload
BlobShape(int s0, int s1, int s2); //!< @overload
BlobShape(int num, int cn, int rows, int cols); //!< Creates 4-dim shape [@p num, @p cn, @p rows, @p cols]
BlobShape(int ndims, const int *sizes); //!< Creates n-dim shape from the @p sizes array
//! Creates n-dim shape from the @p sizes array; if @p sizes is NULL then shape will contain unspecified data
BlobShape(int ndims, const int *sizes);
BlobShape(const std::vector<int> &sizes); //!< Creates n-dim shape from the @p sizes vector
template<int n>
BlobShape(const Vec<int, n> &shape); //!< Creates n-dim shape from @ref cv::Vec
//! Creates n-dim shape and fills it with @p fill
static BlobShape all(int ndims, int fill = 1);
/** @brief Returns number of dimensions. */
int dims() const;
@@ -88,16 +97,41 @@ namespace dnn
*/
int xsize(int axis) const;
/** @brief Converts @p axis index to canonical format (where 0 <= @p axis < dims()). */
int canonicalAxis(int axis) const;
/** @brief Returns the product of all sizes of axes. */
ptrdiff_t total();
ptrdiff_t total() const;
/** @brief Computes the product of sizes of axes among the specified axes range [@p startAxis; @p endAxis).
* @details Negative axis indexing can be used. @sa Blob::total(int,int)
*/
ptrdiff_t total(int startAxis, int endAxis = INT_MAX) const;
/** @brief Constructs new shape from axes in range [@p startAxis; @p endAxis).
* @details Negative axis indexing can be used. @sa Blob::total(int,int)
*/
BlobShape slice(int startAxis, int endAxis = INT_MAX) const;
/** @brief Returns pointer to the first element of continuous size array. */
const int *ptr() const;
/** @overload */
int *ptr();
bool equal(const BlobShape &other) const; //!< Checks equality of two shapes.
bool operator== (const BlobShape &r) const; //!< @sa equal()
/** @brief Checks equality of two shapes. */
bool equal(const BlobShape &other) const;
BlobShape operator+ (const BlobShape &r) const; //!< Concatenates two shapes.
bool operator== (const BlobShape &r) const;
static BlobShape like(const Mat &m); //!< Returns shape of passed Mat.
static BlobShape like(const UMat &m); //!< Returns shape of passed UMat.
static BlobShape empty(); //!< Returns empty shape [].
bool isEmpty() const; //!< Returns true if shape is empty (i.e []).
#ifdef CV_CXX_MOVE_SEMANTICS
//TBD
#endif
private:
cv::AutoBuffer<int,4> sz;
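A small sketch of the shape arithmetic declared above (values illustrative):

void shapeSketch()
{
    using namespace cv::dnn;
    BlobShape s(2, 3, 4, 5);               // 4-dim shape [2, 3, 4, 5]
    CV_Assert(s.total() == 120);           // 2*3*4*5
    CV_Assert(s.total(1, 3) == 12);        // axes 1 and 2: 3*4
    BlobShape tail = s.slice(2);           // [4, 5]
    BlobShape cat = BlobShape(7) + tail;   // concatenation: [7, 4, 5]
    CV_Assert(cat == BlobShape(7, 4, 5));
}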
@@ -109,34 +143,57 @@ namespace dnn
* The class is realized as a wrapper over @ref cv::Mat and @ref cv::UMat.
* It will support methods for switching and logical synchronization between CPU and GPU.
*/
class CV_EXPORTS Blob
class CV_EXPORTS_W Blob
{
public:
explicit Blob();
Blob();
/** @brief Constructs blob with specified @p shape and @p type. */
explicit Blob(const BlobShape &shape, int type = CV_32F);
explicit Blob(const BlobShape &shape, int type = CV_32F, int allocFlags = ALLOC_MAT);
/** @brief Constructs Blob from existing Mat or UMat. */
Blob(InputArray data);
/** @brief Constructs 4-dimensional blob (so-called batch) from image or array of images.
* @param image 2-dimensional multi-channel or 3-dimensional single-channel image (or array of images)
* @param dstCn specify size of second axis of ouptut blob
*/
explicit Blob(InputArray image, int dstCn = -1);
* @param image 2-dimensional multi-channel or 3-dimensional single-channel image (or array of such images)
* @param dstCn specifies size of the second axis of the output blob
*/
static Blob fromImages(InputArray image, int dstCn = -1);
/** @brief Works like Blob::fromImages() but in-place. */
void batchFromImages(InputArray image, int dstCn = -1);
/** @brief Creates blob with specified @p shape and @p type. */
void create(const BlobShape &shape, int type = CV_32F);
void create(const BlobShape &shape, int type = CV_32F, int allocFlags = ALLOC_MAT);
/** @brief Creates blob from cv::Mat or cv::UMat without copying the data */
/** @brief Creates blob from Mat or UMat without copying the data.
* @details If @p in is a Mat then the Mat data is populated, otherwise the UMat data is.
*/
void fill(InputArray in);
/** @brief Creates blob from user data.
* @details If @p deepCopy is false then CPU data will not be allocated.
*/
void fill(const BlobShape &shape, int type, void *data, bool deepCopy = true);
Mat& matRef(); //!< Returns reference to cv::Mat, containing blob data.
const Mat& matRefConst() const; //!< Returns reference to cv::Mat, containing blob data, for read-only purposes.
UMat &umatRef(); //!< Returns reference to cv::UMat, containing blob data (not implemented yet).
const UMat &umatRefConst() const; //!< Returns reference to cv::UMat, containing blob data, for read-only purposes (not implemented yet).
/** @brief Sets @p value to the last used data (if @p allocFlags = -1).
* @details If @p allocFlags != -1 then destination data (Mat or UMat) is determined by flags from AllocFlag enum like in create().
*/
void setTo(InputArray value, int allocFlags = -1);
Mat& matRef(bool writeOnly = true); //!< Returns reference to cv::Mat, containing blob data.
const Mat& matRefConst() const; //!< Returns reference to cv::Mat, containing blob data, for read-only purposes.
UMat &umatRef(bool writeOnly = true); //!< Returns reference to cv::UMat, containing blob data.
const UMat &umatRefConst() const; //!< Returns reference to cv::UMat, containing blob data, for read-only purposes.
template<typename XMat>
XMat &getRef(bool writeOnly = true);
template<typename XMat>
const XMat &getRefConst() const;
void updateMat(bool syncData = true) const; //!< Actualizes data stored inside Mat of Blob; if @p syncData is false then only shape will be actualized.
void updateUMat(bool syncData = true) const; //!< Actualizes data stored inside UMat of Blob; if @p syncData is false then only shape will be actualized.
void sync() const; //!< Updates Mat and UMat of Blob.
/** @brief Returns number of blob dimensions. */
int dims() const;
@@ -163,7 +220,7 @@ namespace dnn
*/
size_t total(int startAxis = 0, int endAxis = INT_MAX) const;
/** @brief Converts @p axis index to canonical format (where 0 <= axis < dims()). */
/** @brief Converts @p axis index to canonical format (where 0 <= @p axis < dims()). */
int canonicalAxis(int axis) const;
/** @brief Returns shape of the blob. */
@@ -172,11 +229,6 @@ namespace dnn
/** @brief Checks equality of two blobs shapes. */
bool equalShape(const Blob &other) const;
/** @brief Returns slice of first two dimensions.
* @details The behaviour is similar to the following numpy code: blob[n, cn, ...]
*/
Mat getPlane(int n, int cn);
/* Shape getters of 4-dimensional blobs. */
int cols() const; //!< Returns size of the fourth axis of the blob.
int rows() const; //!< Returns size of the third axis of the blob.
@@ -204,12 +256,18 @@ namespace dnn
*/
uchar *ptr(int n = 0, int cn = 0, int row = 0, int col = 0);
/** @overload */
template<typename TFloat>
TFloat *ptr(int n = 0, int cn = 0, int row = 0, int col = 0);
template<typename Type>
Type *ptr(int n = 0, int cn = 0, int row = 0, int col = 0);
/** @overload ptr<float>() */
float *ptrf(int n = 0, int cn = 0, int row = 0, int col = 0);
//TODO: add const ptr methods
/** @brief Returns slice of first two dimensions.
* @details The behaviour is similar to the following numpy code: blob[n, cn, ...]
* @todo Method will be removed. Use slice() from shape_utils.hpp.
*/
Mat getPlane(int n, int cn);
/** @brief Shares data from other @p blob.
* @returns *this
*/
@@ -220,13 +278,52 @@ namespace dnn
*/
Blob &reshape(const BlobShape &shape);
/** @brief Returns type of the blob. */
int type() const;
/** @brief Changes shape of the blob without copying the data.
* @returns shallow copy of original blob with new shape.
*/
Blob reshaped(const BlobShape &newShape) const;
int type() const; //!< Returns type of the blob.
int elemSize() const; //!< Returns size of single element in bytes.
int getState() const; //!< Returns current state of the blob, @see DataState.
private:
const int *sizes() const;
# define CV_DNN_UMAT //DBG
#ifdef HAVE_OPENCL
# define CV_DNN_UMAT
#endif
#ifdef CV_DNN_UMAT
# define CV_DNN_UMAT_ONLY(expr) (expr)
#else
# define CV_DNN_UMAT_ONLY(expr)
#endif
#ifndef CV_DNN_UMAT
Mat m;
#else
mutable Mat m;
mutable UMat um;
mutable uchar state;
#endif
public:
enum DataState
{
UNINITIALIZED,
HEAD_AT_MAT,
HEAD_AT_UMAT,
SYNCED
};
enum AllocFlag
{
ALLOC_MAT = 1,
ALLOC_UMAT = 2,
ALLOC_BOTH = 3
};
};
//! @}
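A short sketch of typical Blob usage based on the interface above (shape and values illustrative):

void blobSketch()
{
    using namespace cv;
    using namespace cv::dnn;
    Blob b(BlobShape(1, 3, 224, 224), CV_32F);   // NCHW-style batch in CPU (Mat) storage
    b.matRef().setTo(0);                          // write through the Mat head
    float *px = b.ptrf(0, 2, 10, 10);             // element at (n=0, cn=2, row=10, col=10)
    Mat plane = b.getPlane(0, 1);                 // 2-d view of sample 0, channel 1
    Blob flat = b.reshaped(BlobShape(1, 3 * 224 * 224)); // shallow copy with new shape
    (void)px; (void)plane; (void)flat;
}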

@@ -48,20 +48,50 @@ namespace cv
namespace dnn
{
inline BlobShape::BlobShape(int ndims, int fill) : sz( (size_t)std::max(ndims, 0) )
inline BlobShape::BlobShape()
{
sz.allocate(4);
for (size_t i = 0; i < sz.size(); i++)
sz[i] = 1;
}
inline BlobShape BlobShape::all(int ndims, int fill)
{
CV_Assert(ndims >= 0);
BlobShape res;
res.sz.allocate(ndims);
for (int i = 0; i < ndims; i++)
sz[i] = fill;
res.sz[i] = fill;
return res;
}
inline BlobShape::BlobShape(int ndims, const int *sizes) : sz( (size_t)std::max(ndims, 0) )
{
CV_Assert(ndims >= 0);
if (!sizes)
return;
for (int i = 0; i < ndims; i++)
sz[i] = sizes[i];
}
inline BlobShape::BlobShape(int s0) : sz(1)
{
sz[0] = s0;
}
inline BlobShape::BlobShape(int s0, int s1) : sz(2)
{
sz[0] = s0;
sz[1] = s1;
}
inline BlobShape::BlobShape(int s0, int s1, int s2) : sz(3)
{
sz[0] = s0;
sz[1] = s1;
sz[2] = s2;
}
inline BlobShape::BlobShape(int num, int cn, int rows, int cols) : sz(4)
{
sz[0] = num;
@@ -120,7 +150,13 @@ inline int &BlobShape::operator[] (int axis)
return sz[(axis < 0) ? axis + dims() : axis];
}
inline ptrdiff_t BlobShape::total()
inline int BlobShape::canonicalAxis(int axis) const
{
CV_Assert(-dims() <= axis && axis < dims());
return (axis < 0) ? axis + dims() : axis;
}
inline ptrdiff_t BlobShape::total() const
{
if (dims() == 0)
return 0;
@@ -131,11 +167,52 @@ inline ptrdiff_t BlobShape::total()
return res;
}
inline ptrdiff_t BlobShape::total(int startAxis, int endAxis) const
{
if (isEmpty())
return 0;
if (endAxis == INT_MAX)
endAxis = dims();
else if (endAxis < 0)
endAxis += dims();
startAxis = (startAxis < 0) ? startAxis + dims() : startAxis;
CV_Assert(0 <= startAxis && startAxis <= endAxis && endAxis <= dims());
ptrdiff_t res = 1;
for (int i = startAxis; i < endAxis; i++)
res *= sz[i];
return res;
}
inline BlobShape BlobShape::slice(int startAxis, int endAxis) const
{
if (isEmpty())
return BlobShape::empty();
if (endAxis == INT_MAX)
endAxis = dims();
else if (endAxis < 0)
endAxis += dims();
startAxis = (startAxis < 0) ? startAxis + dims() : startAxis;
CV_Assert(0 <= startAxis && startAxis <= endAxis && endAxis <= dims());
BlobShape res(endAxis - startAxis, (const int*)NULL);
for (int i = startAxis; i < endAxis; i++)
res[i - startAxis] = sz[i];
return res;
}
inline const int *BlobShape::ptr() const
{
return sz;
}
inline int *BlobShape::ptr()
{
return sz;
}
inline bool BlobShape::equal(const BlobShape &other) const
{
if (this->dims() != other.dims())
@@ -155,19 +232,83 @@ inline bool BlobShape::operator==(const BlobShape &r) const
return this->equal(r);
}
inline BlobShape BlobShape::like(const Mat &m)
{
return BlobShape(m.dims, (const int*)m.size);
}
inline BlobShape BlobShape::like(const UMat &m)
{
return BlobShape(m.dims, (const int*)m.size);
}
inline BlobShape BlobShape::empty()
{
return BlobShape(0, (const int*)NULL);
}
inline bool BlobShape::isEmpty() const
{
return dims() == 0;
}
inline BlobShape BlobShape::operator+(const BlobShape &r) const
{
BlobShape newShape(this->dims() + r.dims(), (int*)NULL);
for (int i = 0; i < this->dims(); i++)
newShape[i] = (*this)[i];
for (int i = 0; i < r.dims(); i++)
newShape[this->dims() + i] = r[i];
return newShape;
}
CV_EXPORTS std::ostream &operator<< (std::ostream &stream, const BlobShape &shape);
/////////////////////////////////////////////////////////////////////
inline int Blob::canonicalAxis(int axis) const
#ifndef CV_DNN_UMAT
# define CV_DNN_SWITCH_MU(cpu_expr, gpu_expr) (cpu_expr)
#else
# define CV_DNN_SWITCH_MU(cpu_expr, gpu_expr) ((state == HEAD_AT_UMAT) ? (gpu_expr) : (cpu_expr))
#endif
inline int Blob::dims() const
{
CV_Assert(-dims() <= axis && axis < dims());
return (axis < 0) ? axis + dims() : axis;
return CV_DNN_SWITCH_MU(m.dims, um.dims);
}
inline int Blob::dims() const
inline const int * Blob::sizes() const
{
return m.dims;
return CV_DNN_SWITCH_MU((const int*)m.size, (const int*)um.size);
}
inline int Blob::type() const
{
return CV_DNN_SWITCH_MU(m.type(), um.type());
}
template<int n>
inline size_t Blob::offset(const Vec<int, n> &pos) const
{
const MatStep &step = CV_DNN_SWITCH_MU(m.step, um.step);
size_t ofs = 0;
int i;
for (i = 0; i < std::min(n, dims()); i++)
{
CV_DbgAssert(pos[i] >= 0 && pos[i] < size(i));
ofs += step[i] * pos[i];
}
for (; i < dims(); i++)
CV_DbgAssert(pos[i] == 0);
CV_DbgAssert(ofs % elemSize() == 0);
return ofs / elemSize();
}
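// Worked example: for a dense CV_32F blob of shape [2, 3, 4, 4] the per-axis
// steps are [48, 16, 4, 1] elements, so offset(Vec<int,4>(1, 2, 3, 0))
// = 1*48 + 2*16 + 3*4 + 0*1 = 92 elements from the start of the data.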
inline int Blob::canonicalAxis(int axis) const
{
CV_Assert(-dims() <= axis && axis < dims());
return (axis < 0) ? axis + dims() : axis;
}
inline int Blob::xsize(int axis) const
@@ -196,27 +337,11 @@ inline size_t Blob::total(int startAxis, int endAxis) const
CV_Assert(0 <= startAxis && startAxis <= endAxis && endAxis <= dims());
size_t size = 1; //fix: assume that slice isn't empty
size_t cnt = 1; //fix: assume that slice isn't empty
for (int i = startAxis; i < endAxis; i++)
size *= (size_t)sizes()[i];
cnt *= (size_t)sizes()[i];
return size;
}
template<int n>
inline size_t Blob::offset(const Vec<int, n> &pos) const
{
size_t ofs = 0;
int i;
for (i = 0; i < std::min(n, dims()); i++)
{
CV_DbgAssert(pos[i] >= 0 && pos[i] < size(i));
ofs = ofs * (size_t)size(i) + pos[i];
}
for (; i < dims(); i++)
ofs *= (size_t)size(i);
return ofs;
return cnt;
}
inline size_t Blob::offset(int n, int cn, int row, int col) const
@@ -226,20 +351,20 @@ inline size_t Blob::offset(int n, int cn, int row, int col) const
inline float *Blob::ptrf(int n, int cn, int row, int col)
{
CV_Assert(type() == CV_32F);
return (float*)m.data + offset(n, cn, row, col);
return matRef(false).ptr<float>() + offset(n, cn, row, col);
}
inline uchar *Blob::ptr(int n, int cn, int row, int col)
{
return m.data + m.elemSize() * offset(n, cn, row, col);
Mat &mat = matRef(false);
return mat.ptr() + mat.elemSize() * offset(n, cn, row, col);
}
template<typename TFloat>
inline TFloat* Blob::ptr(int n, int cn, int row, int col)
template<typename Dtype>
inline Dtype* Blob::ptr(int n, int cn, int row, int col)
{
CV_Assert(type() == cv::DataDepth<TFloat>::value);
return (TFloat*) ptr(n, cn, row, col);
CV_Assert(type() == cv::DataDepth<Dtype>::value);
return (Dtype*) ptr(n, cn, row, col);
}
inline BlobShape Blob::shape() const
@@ -260,26 +385,69 @@ inline bool Blob::equalShape(const Blob &other) const
return true;
}
inline Mat& Blob::matRef()
inline Mat& Blob::matRef(bool writeOnly)
{
#ifdef CV_DNN_UMAT
updateMat(!writeOnly);
state = HEAD_AT_MAT;
#else
(void)writeOnly;
#endif
return m;
}
inline const Mat& Blob::matRefConst() const
{
CV_DNN_UMAT_ONLY( updateMat() );
return m;
}
inline UMat &Blob::umatRef()
inline UMat &Blob::umatRef(bool writeOnly)
{
CV_Error(Error::StsNotImplemented, "");
#ifndef CV_DNN_UMAT
CV_Error(Error::GpuNotSupported, "");
(void)writeOnly;
return *(new UMat());
#else
updateUMat(!writeOnly);
state = HEAD_AT_UMAT;
return um;
#endif
}
inline const UMat &Blob::umatRefConst() const
{
CV_Error(Error::StsNotImplemented, "");
#ifndef CV_DNN_UMAT
CV_Error(Error::GpuNotSupported, "");
return *(new UMat());
#else
updateUMat();
return um;
#endif
}
template<>
inline Mat &Blob::getRef<Mat>(bool writeOnly)
{
return matRef(writeOnly);
}
template<>
inline UMat &Blob::getRef<UMat>(bool writeOnly)
{
return umatRef(writeOnly);
}
template<>
inline const Mat &Blob::getRefConst<Mat>() const
{
return matRefConst();
}
template<>
inline const UMat &Blob::getRefConst<UMat>() const
{
return umatRefConst();
}
inline Mat Blob::getPlane(int n, int cn)
@@ -313,27 +481,44 @@ inline Size Blob::size2() const
return Size(cols(), rows());
}
inline int Blob::type() const
inline Blob &Blob::shareFrom(const Blob &blob)
{
return m.depth();
this->m = blob.m;
#ifdef CV_DNN_UMAT
this->um = blob.um;
this->state = blob.state;
#endif
return *this;
}
inline const int * Blob::sizes() const
inline Blob &Blob::reshape(const BlobShape &newShape)
{
return &m.size[0];
if (!m.empty()) m = m.reshape(1, newShape.dims(), newShape.ptr());
#ifdef CV_DNN_UMAT
if (!um.empty()) um = um.reshape(1, newShape.dims(), newShape.ptr());
#endif
return *this;
}
inline Blob Blob::reshaped(const BlobShape &newShape) const
{
Blob res(*this); //also, res.shareFrom(*this) could be used
res.reshape(newShape);
return res;
}
inline Blob &Blob::shareFrom(const Blob &blob)
inline int Blob::elemSize() const
{
this->m = blob.m;
return *this;
return CV_ELEM_SIZE(type());
}
inline Blob &Blob::reshape(const BlobShape &shape)
inline int Blob::getState() const
{
m = m.reshape(1, shape.dims(), shape.ptr());
return *this;
#ifdef CV_DNN_UMAT
return this->state;
#else
return m.empty() ? UNINITIALIZED : HEAD_AT_MAT;
#endif
}
}
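To illustrate the Mat/UMat head switching implemented above, a hedged sketch (that the state ends up SYNCED after the read-only access is an assumption about updateUMat()):

void blobSyncSketch()
{
    using namespace cv;
    using namespace cv::dnn;
    Blob b(BlobShape(1, 3, 4, 4), CV_32F, Blob::ALLOC_BOTH);
    b.matRef().setTo(1);               // head moves to the Mat copy (HEAD_AT_MAT)
    const UMat &u = b.umatRefConst();  // updateUMat() copies the data into the UMat copy
    (void)u;                           // both copies now agree; SYNCED state is assumed
}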

@@ -95,10 +95,10 @@ private:
AutoBuffer<int64, 1> *pi;
AutoBuffer<double, 1> *pd;
AutoBuffer<String, 1> *ps;
void *p;
void *pv;
};
DictValue(int _type, void *_p) : type(_type), p(_p) {}
DictValue(int _type, void *_p) : type(_type), pv(_p) {}
void release();
};

@@ -59,15 +59,17 @@ namespace dnn //! This namespace is used for dnn module functionality.
* This function is automatically called on most OpenCV builds,
* but you need to call it manually in some specific configurations (iOS for example).
*/
CV_EXPORTS void initModule();
CV_EXPORTS_W void initModule();
/** @brief This class provides all data needed to initialize layer.
*
* It includes a dictionary with scalar params (which can be read by using the Dict interface),
* blob params #blobs and optional meta information: #name and #type of layer instance.
*/
struct CV_EXPORTS LayerParams : public Dict
class CV_EXPORTS LayerParams : public Dict
{
public:
//TODO: Add ability to name blob params
std::vector<Blob> blobs; //!< List of learned parameters stored as blobs.
String name; //!< Name of the layer instance (optional, can be used for internal purposes).
@@ -77,10 +79,12 @@ namespace dnn //! This namespace is used for dnn module functionality.
/** @brief This interface class allows building new Layers, which are the building blocks of networks.
*
* Each class, derived from Layer, must implement the allocate() method to declare its own outputs and forward() to compute outputs.
* Also before using the new layer into networks you must register your layer by using one of @ref LayerFactoryModule "LayerFactory" macros.
* Also, before using the new layer in networks you must register your layer by using one of the @ref dnnLayerFactory "LayerFactory" macros.
*/
struct CV_EXPORTS Layer
class CV_EXPORTS_W Layer
{
public:
//! List of learned parameters must be stored here to allow reading them by using Net::getParam().
std::vector<Blob> blobs;
@@ -116,7 +120,8 @@ namespace dnn //! This namespace is used for dnn module functionality.
String type; //!< Type name which was used for creating layer by layer factory.
Layer();
explicit Layer(const LayerParams &params); //!< Initialize only #name, #type and #blobs fields.
explicit Layer(const LayerParams &params); //!< Initializes only #name, #type and #blobs fields.
void setParamsFrom(const LayerParams &params); //!< Initializes only #name, #type and #blobs fields.
virtual ~Layer();
};
@@ -130,7 +135,7 @@ namespace dnn //! This namespace is used for dnn module functionality.
*
* This class supports reference counting of its instances, i.e. copies point to the same instance.
*/
class CV_EXPORTS Net
class CV_EXPORTS_W Net
{
public:
@@ -174,6 +179,7 @@ namespace dnn //! This namespace is used for dnn module functionality.
* @see setNetInputs(), Layer::inputNameToIndex(), Layer::outputNameToIndex()
*/
void connect(String outPin, String inpPin);
/** @brief Connects #@p outNum output of the first layer to #@p inNum input of the second layer.
* @param outLayerId identifier of the first layer
* @param inpLayerId identifier of the second layer
@@ -181,6 +187,7 @@ namespace dnn //! This namespace is used for dnn module functionality.
* @param inpNum number of the second layer input
*/
void connect(int outLayerId, int outNum, int inpLayerId, int inpNum);
/** @brief Sets output names of the network input pseudo layer.
*
* Each net always has its own special network input pseudo layer with id=0.
@@ -267,10 +274,10 @@ namespace dnn //! This namespace is used for dnn module functionality.
* @param isBinary specifies whether the network was serialized in ascii mode or binary.
* @returns Pointer to the created importer, NULL in failure cases.
*
* @warning Torch7 importer is experimental now, you need explicitly set CMake opencv_dnn_BUILD_TORCH_IMPORTER flag to compile its.
* @warning Torch7 importer is experimental now; you need to explicitly set the CMake `opencv_dnn_BUILD_TORCH_IMPORTER` flag to compile it.
*
* @note Ascii mode of Torch serializer is more preferable, because binary mode extensively use long type of C language,
* which has different bit-length on different systems.
* @note ASCII mode of the Torch serializer is preferable, because binary mode extensively uses the `long` type of the C language,
* which has various bit-lengths on different systems.
*
* The loading file must contain a serialized <a href="https://github.com/torch/nn/blob/master/doc/module.md">nn.Module</a> object
* with the network being imported. Try to eliminate custom objects from the serialized data to avoid import errors.
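A hedged end-to-end sketch of the importer workflow (file names are hypothetical; a default-constructed Net plus populateNet(), setBlob(), forward() and getBlob() are assumed from the Importer/Net interface as used in the caffe_googlenet.cpp sample):

void importSketch(const cv::Mat &img)
{
    using namespace cv;
    using namespace cv::dnn;
    Ptr<Importer> importer = createCaffeImporter("net.prototxt", "net.caffemodel");
    Net net;
    importer->populateNet(net);                  // populateNet() assumed from the Importer interface
    importer.release();                          // the importer is no longer needed
    net.setBlob(".data", Blob::fromImages(img)); // ".data" pin name follows the sample
    net.forward();
    Blob prob = net.getBlob("prob");
    (void)prob;
}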

@@ -86,7 +86,7 @@ inline DictValue DictValue::get<DictValue>(int idx) const
template<>
inline int64 DictValue::get<int64>(int idx) const
{
CV_Assert(idx == -1 && size() == 1 || idx >= 0 && idx < size());
CV_Assert((idx == -1 && size() == 1) || (idx >= 0 && idx < size()));
idx = (idx == -1) ? 0 : idx;
if (type == Param::INT)
@@ -131,7 +131,7 @@ inline bool DictValue::get<bool>(int idx) const
template<>
inline double DictValue::get<double>(int idx) const
{
CV_Assert(idx == -1 && size() == 1 || idx >= 0 && idx < size());
CV_Assert((idx == -1 && size() == 1) || (idx >= 0 && idx < size()));
idx = (idx == -1) ? 0 : idx;
if (type == Param::REAL)
@@ -159,7 +159,7 @@ template<>
inline String DictValue::get<String>(int idx) const
{
CV_Assert(isString());
CV_Assert(idx == -1 && ps->size() == 1 || idx >= 0 && idx < (int)ps->size());
CV_Assert((idx == -1 && ps->size() == 1) || (idx >= 0 && idx < (int)ps->size()));
return (*ps)[(idx == -1) ? 0 : idx];
}

@@ -50,7 +50,7 @@ namespace dnn
//! @addtogroup dnn
//! @{
//!
//! @defgroup LayerFactoryModule Utilities for new layers registration
//! @defgroup dnnLayerFactory Utilities for New Layers Registration
//! @{
/** @brief %Layer factory allows creating instances of registered layers. */
@@ -86,7 +86,7 @@ private:
* @details This macro must be placed inside the function code.
*/
#define REG_RUNTIME_LAYER_FUNC(type, constuctorFunc) \
LayerFactory::registerLayer(#type, constuctorFunc);
cv::dnn::LayerFactory::registerLayer(#type, constuctorFunc);
/** @brief Registers layer class in runtime.
* @param type string, containing type name of the layer.
@@ -94,7 +94,7 @@ private:
* @details This macro must be placed inside the function code.
*/
#define REG_RUNTIME_LAYER_CLASS(type, class) \
LayerFactory::registerLayer(#type, _layerDynamicRegisterer<class>);
cv::dnn::LayerFactory::registerLayer(#type, _layerDynamicRegisterer<class>);
/** @brief Registers layer constructor on module load time.
* @param type string, containing type name of the layer.
@@ -102,7 +102,7 @@ private:
* @details This macro must be placed outside the function code.
*/
#define REG_STATIC_LAYER_FUNC(type, constuctorFunc) \
static _LayerStaticRegisterer __LayerStaticRegisterer_##type(#type, constuctorFunc);
static cv::dnn::_LayerStaticRegisterer __LayerStaticRegisterer_##type(#type, constuctorFunc);
/** @brief Registers layer class on module load time.
* @param type string, containing type name of the layer.
@@ -126,14 +126,15 @@ Ptr<Layer> _layerDynamicRegisterer(LayerParams &params)
}
//allows automatic registration of created layers at module load time
struct _LayerStaticRegisterer
class _LayerStaticRegisterer
{
String type;
public:
_LayerStaticRegisterer(const String &type, LayerFactory::Constuctor constuctor)
_LayerStaticRegisterer(const String &layerType, LayerFactory::Constuctor layerConstuctor)
{
this->type = type;
LayerFactory::registerLayer(type, constuctor);
this->type = layerType;
LayerFactory::registerLayer(layerType, layerConstuctor);
}
~_LayerStaticRegisterer()
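A hedged sketch of registering a custom layer with the macros above (MyIdentityLayer is hypothetical; the allocate()/forward() signatures are assumed from the Layer interface):

class MyIdentityLayer : public cv::dnn::Layer
{
public:
    MyIdentityLayer(cv::dnn::LayerParams &params) : Layer(params) {}
    void allocate(const std::vector<cv::dnn::Blob*> &inputs, std::vector<cv::dnn::Blob> &outputs)
    {
        outputs.resize(1);
        outputs[0].create(inputs[0]->shape(), inputs[0]->type());
    }
    void forward(std::vector<cv::dnn::Blob*> &inputs, std::vector<cv::dnn::Blob> &outputs)
    {
        inputs[0]->matRefConst().copyTo(outputs[0].matRef()); // pass data through unchanged
    }
};

void registerMyLayers()
{
    REG_RUNTIME_LAYER_CLASS(MyIdentity, MyIdentityLayer) // the macro must run inside a function
}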

@@ -0,0 +1,137 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_DNN_DNN_SHAPE_UTILS_HPP__
#define __OPENCV_DNN_DNN_SHAPE_UTILS_HPP__
#include <opencv2/core.hpp>
#include <ostream>
namespace cv {
namespace dnn {
//Useful shortcut
typedef BlobShape Shape;
inline std::ostream &operator<< (std::ostream &s, cv::Range &r)
{
return s << "[" << r.start << ", " << r.end << ")";
}
//Reshaping
//TODO: add -1 specifier for automatic size inferring
template<typename Mat>
void reshape(Mat &m, const BlobShape &shape)
{
m = m.reshape(1, shape.dims(), shape.ptr());
}
template<typename Mat>
Mat reshaped(const Mat &m, const BlobShape &shape)
{
return m.reshape(1, shape.dims(), shape.ptr());
}
//Slicing
struct _Range : public cv::Range
{
_Range(const Range &r) : cv::Range(r) {}
_Range(int start, int size = 1) : cv::Range(start, start + size) {}
};
template<typename Mat>
Mat slice(const Mat &m, const _Range &r0)
{
//CV_Assert(m.dims >= 1);
cv::AutoBuffer<cv::Range, 4> ranges(m.dims);
for (int i = 1; i < m.dims; i++)
ranges[i] = Range::all();
ranges[0] = r0;
return m(&ranges[0]);
}
template<typename Mat>
Mat slice(const Mat &m, const _Range &r0, const _Range &r1)
{
CV_Assert(m.dims >= 2);
cv::AutoBuffer<cv::Range, 4> ranges(m.dims);
for (int i = 2; i < m.dims; i++)
ranges[i] = Range::all();
ranges[0] = r0;
ranges[1] = r1;
return m(&ranges[0]);
}
template<typename Mat>
Mat slice(const Mat &m, const _Range &r0, const _Range &r1, const _Range &r2)
{
CV_Assert(m.dims <= 3);
cv::AutoBuffer<cv::Range, 4> ranges(m.dims);
for (int i = 3; i < m.dims; i++)
ranges[i] = Range::all();
ranges[0] = r0;
ranges[1] = r1;
ranges[2] = r2;
return m(&ranges[0]);
}
template<typename Mat>
Mat slice(const Mat &m, const _Range &r0, const _Range &r1, const _Range &r2, const _Range &r3)
{
CV_Assert(m.dims <= 4);
cv::AutoBuffer<cv::Range, 4> ranges(m.dims);
for (int i = 4; i < m.dims; i++)
ranges[i] = Range::all();
ranges[0] = r0;
ranges[1] = r1;
ranges[2] = r2;
ranges[3] = r3;
return m(&ranges[0]);
}
BlobShape computeShapeByReshapeMask(const BlobShape &srcShape, const BlobShape &maskShape, Range srcRange = Range::all());
}
}
#endif

@ -0,0 +1,80 @@
#include "perf_precomp.hpp"
namespace cvtest
{
using std::tr1::tuple;
using std::tr1::get;
using std::tr1::make_tuple;
using std::make_pair;
using namespace perf;
using namespace testing;
using namespace cv;
using namespace cv::dnn;
enum {STRIDE_OFF = 1, STRIDE_ON = 2};
CV_ENUM(StrideSize, STRIDE_OFF, STRIDE_ON);
enum {GROUP_OFF = 1, GROUP_2 = 2};
CV_ENUM(GroupSize, GROUP_OFF, GROUP_2);
//Squared Size
#define SSZ(n) cv::Size(n, n)
typedef std::pair<BlobShape, int> InpShapeNumOut;
typedef tuple<Size, InpShapeNumOut, GroupSize, StrideSize> ConvParam; //kernel_size, inp shape, groups, stride
typedef TestBaseWithParam<ConvParam> ConvolutionPerfTest;
PERF_TEST_P( ConvolutionPerfTest, perf, Combine(
Values(Size(1, 1), Size(3, 3), Size(5, 5), Size(11, 11)),
Values(make_pair(BlobShape(1, 4, 224, 224), 64),
make_pair(BlobShape(1, 64, 112, 122), 128),
make_pair(BlobShape(1, 256, 28, 28), 512)),
GroupSize::all(),
StrideSize::all())
)
{
RNG rng(0);
ConvParam params = GetParam();
int ksz = get<0>(params).width;
BlobShape inpShape = get<1>(params).first;
int outCn = get<1>(params).second;
int groups = get<2>(params);
int stride = (ksz >= 11) ? 4 : (int)get<3>(params);
int inpCn = inpShape[1];
Blob wgtBlob(BlobShape(outCn, inpCn/groups, ksz, ksz)), biasBlob(BlobShape(outCn, 1, 1, 1));
Blob inpBlob(inpShape);
rng.fill(biasBlob.matRef(), RNG::UNIFORM, -1, +1);
rng.fill(wgtBlob.matRef(), RNG::UNIFORM, -1, +1);
rng.fill(inpBlob.matRef(), RNG::UNIFORM, -1, +1);
LayerParams lp;
lp.set("num_output", outCn);
lp.set("group", groups);
lp.set("stride", stride);
lp.set("kernel_size", ksz);
lp.blobs.reserve(2);
lp.blobs.push_back(wgtBlob);
lp.blobs.push_back(biasBlob);
std::vector<Blob*> inpBlobs(1, &inpBlob);
std::vector<Blob> outBlobs;
cv::setNumThreads(cv::getNumberOfCPUs());
Ptr<Layer> layer = cv::dnn::LayerFactory::createLayerInstance("Convolution", lp);
layer->allocate(inpBlobs, outBlobs);
declare.in(inpBlob.matRef(), wgtBlob.matRef(), WARMUP_RNG).out(outBlobs[0].matRef()).tbb_threads(cv::getNumThreads());
TEST_CYCLE_N(10)
{
layer->forward(inpBlobs, outBlobs);
}
SANITY_CHECK_NOTHING();
}
}

@ -0,0 +1,3 @@
#include "perf_precomp.hpp"
CV_PERF_TEST_MAIN(dnn)

@ -0,0 +1,17 @@
#ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wmissing-declarations"
# if defined __clang__ || defined __APPLE__
# pragma GCC diagnostic ignored "-Wmissing-prototypes"
# pragma GCC diagnostic ignored "-Wextra"
# endif
#endif
#ifndef __OPENCV_PERF_PRECOMP_HPP__
#define __OPENCV_PERF_PRECOMP_HPP__
#include <opencv2/ts.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/dnn.hpp>
#endif

@ -0,0 +1 @@
*.caffemodel

@ -124,8 +124,8 @@ int main(int argc, char **argv)
exit(-1);
}
resize(img, img, Size(224, 224)); //GoogLeNet accepts only 224x224 RGB-images
dnn::Blob inputBlob = dnn::Blob(img); //Convert Mat to dnn::Blob image batch
resize(img, img, Size(224, 224)); //GoogLeNet accepts only 224x224 RGB-images
dnn::Blob inputBlob = dnn::Blob::fromImages(img); //Convert Mat to dnn::Blob batch of images
//! [Prepare blob]
//! [Set input blob]

@ -40,179 +40,383 @@
//M*/
#include "precomp.hpp"
#include <opencv2/dnn/shape_utils.hpp>
namespace cv
{
namespace dnn
{
Blob::Blob()
{
int zeros[4] = { 0, 0, 0, 0 };
m = Mat(4, zeros, CV_32F, NULL);
}
Blob::Blob()
{
CV_DNN_UMAT_ONLY(state = UNINITIALIZED);
}
static inline int getMatChannels(const Mat &mat)
Blob::Blob(const BlobShape &shape, int type, int allocFlags)
{
CV_DNN_UMAT_ONLY(state = UNINITIALIZED);
this->create(shape, type, allocFlags);
}
Blob::Blob(InputArray data)
{
#ifndef CV_DNN_UMAT
m = data.getMat();
#else
CV_Assert(data.isMat() || data.isUMat());
if (data.isMat())
{
return (mat.dims <= 2) ? mat.channels() : mat.size[0];
m = data.getMat();
state = HEAD_AT_MAT;
}
static BlobShape getBlobShape(std::vector<Mat> &vmat, int requestedCn = -1)
else
{
BlobShape shape(4);
int cnSum = 0, matCn;
um = data.getUMat();
state = HEAD_AT_UMAT;
}
#endif
}
CV_Assert(vmat.size() > 0);
void Blob::create(const BlobShape &shape, int type, int allocFlags)
{
#ifndef CV_DNN_UMAT
CV_Assert(allocFlags & ALLOC_MAT);
m.create(shape.dims(), shape.ptr(), type);
#else
CV_Assert(allocFlags & ALLOC_MAT || allocFlags & ALLOC_UMAT);
for (size_t i = 0; i < vmat.size(); i++)
{
Mat &mat = vmat[i];
CV_Assert(!mat.empty());
CV_Assert((mat.dims == 3 && mat.channels() == 1) || mat.dims <= 2);
if (allocFlags & ALLOC_MAT)
m.create(shape.dims(), shape.ptr(), type);
if (allocFlags & ALLOC_UMAT)
um.create(shape.dims(), shape.ptr(), type);
matCn = getMatChannels(mat);
cnSum += getMatChannels(mat);
if (state == UNINITIALIZED)
{
if (allocFlags & ALLOC_MAT && allocFlags & ALLOC_UMAT)
state = SYNCED;
else if (allocFlags & ALLOC_MAT)
state = HEAD_AT_MAT;
else
state = HEAD_AT_UMAT;
}
#endif
}
if (i == 0)
{
shape[-1] = mat.cols;
shape[-2] = mat.rows;
shape[-3] = (requestedCn <= 0) ? matCn : requestedCn;
}
else
{
if (mat.cols != shape[-1] || mat.rows != shape[-2])
CV_Error(Error::StsError, "Each Mat.size() must be equal");
void Blob::fill(InputArray in)
{
#ifdef CV_DNN_UMAT
CV_Assert(in.isMat() || in.isUMat());
if (in.isMat())
{
m = in.getMat();
state = HEAD_AT_MAT;
}
else
{
um = in.getUMat();
state = HEAD_AT_UMAT;
}
#else
CV_Assert(in.isMat());
m = in.getMat();
#endif
}
if (requestedCn <= 0 && matCn != shape[-3])
CV_Error(Error::StsError, "Each Mat.chnannels() (or number of planes) must be equal");
}
}
static inline int getMatChannels(const Mat &mat)
{
return (mat.dims <= 2) ? mat.channels() : mat.size[0];
}
if (cnSum % shape[-3] != 0)
CV_Error(Error::StsError, "Total number of channels in vector is not a multiple of requsted channel number");
static BlobShape getBlobShape(std::vector<Mat> &vmat, int requestedCn = -1)
{
BlobShape shape(BlobShape::all(4));
int cnSum = 0, matCn;
shape[0] = cnSum / shape[-3];
return shape;
}
CV_Assert(vmat.size() > 0);
static std::vector<Mat> extractMatVector(InputArray in)
for (size_t i = 0; i < vmat.size(); i++)
{
if (in.isMat() || in.isUMat())
{
return std::vector<Mat>(1, in.getMat());
}
else if (in.isMatVector())
{
return *static_cast<const std::vector<Mat>*>(in.getObj());
}
else if (in.isUMatVector())
Mat &mat = vmat[i];
CV_Assert(!mat.empty());
CV_Assert((mat.dims == 3 && mat.channels() == 1) || mat.dims <= 2);
matCn = getMatChannels(mat);
cnSum += getMatChannels(mat);
if (i == 0)
{
std::vector<Mat> vmat;
in.getMatVector(vmat);
return vmat;
shape[-1] = mat.cols;
shape[-2] = mat.rows;
shape[-3] = (requestedCn <= 0) ? matCn : requestedCn;
}
else
{
CV_Assert(in.isMat() || in.isMatVector() || in.isUMat() || in.isUMatVector());
return std::vector<Mat>();
if (mat.cols != shape[-1] || mat.rows != shape[-2])
CV_Error(Error::StsError, "Each Mat.size() must be equal");
if (requestedCn <= 0 && matCn != shape[-3])
CV_Error(Error::StsError, "Each Mat.chnannels() (or number of planes) must be equal");
}
}
Blob::Blob(InputArray image, int dstCn)
if (cnSum % shape[-3] != 0)
CV_Error(Error::StsError, "Total number of channels in vector is not a multiple of requsted channel number");
shape[0] = cnSum / shape[-3];
return shape;
}
static std::vector<Mat> extractMatVector(InputArray in)
{
if (in.isMat() || in.isUMat())
{
return std::vector<Mat>(1, in.getMat());
}
else if (in.isMatVector())
{
return *static_cast<const std::vector<Mat>*>(in.getObj());
}
else if (in.isUMatVector())
{
CV_Assert(dstCn == -1 || dstCn > 0);
std::vector<Mat> inMats = extractMatVector(image);
BlobShape dstShape = getBlobShape(inMats, dstCn);
std::vector<Mat> vmat;
in.getMatVector(vmat);
return vmat;
}
else
{
CV_Assert(in.isMat() || in.isMatVector() || in.isUMat() || in.isUMatVector());
return std::vector<Mat>();
}
}
m.create(dstShape.dims(), dstShape.ptr(), CV_32F);
void Blob::batchFromImages(InputArray image, int dstCn)
{
CV_Assert(dstCn == -1 || dstCn > 0);
std::vector<Mat> inMats = extractMatVector(image);
BlobShape dstShape = getBlobShape(inMats, dstCn);
std::vector<Mat> wrapBuf(dstShape[-3]);
int elemSize = (int)m.elemSize();
uchar *ptr = this->ptr();
for (size_t i = 0; i < inMats.size(); i++)
{
Mat inMat = inMats[i];
int dtype = CV_32F;
this->create(dstShape, dtype, ALLOC_MAT);
uchar *dstPtr = this->matRef().ptr();
int elemSize = CV_ELEM_SIZE(dtype);
if (inMat.dims <= 2)
{
inMat.convertTo(inMat, m.type());
std::vector<Mat> wrapBuf(dstShape[-3]);
for (size_t i = 0; i < inMats.size(); i++)
{
Mat inMat = inMats[i];
wrapBuf.resize(0);
for (int cn = 0; cn < inMat.channels(); cn++)
{
wrapBuf.push_back(Mat(inMat.rows, inMat.cols, m.type(), ptr));
ptr += elemSize * inMat.total();
}
if (inMat.dims <= 2)
{
inMat.convertTo(inMat, dtype);
cv::split(inMat, wrapBuf);
}
else
wrapBuf.resize(0);
for (int cn = 0; cn < inMat.channels(); cn++)
{
inMat.convertTo(Mat(inMat.dims, inMat.size, m.type(), ptr), m.type());
ptr += elemSize * inMat.total();
wrapBuf.push_back(Mat(inMat.rows, inMat.cols, dtype, dstPtr));
dstPtr += elemSize * inMat.total();
}
cv::split(inMat, wrapBuf);
}
else
{
inMat.convertTo(Mat(inMat.dims, inMat.size, dtype, dstPtr), dtype);
dstPtr += elemSize * inMat.total();
}
}
}
Blob Blob::fromImages(InputArray image, int dstCn)
{
Blob res;
res.batchFromImages(image, dstCn);
return res;
}
Blob::Blob(const BlobShape &shape, int type)
void Blob::fill(const BlobShape &shape, int type, void *data, bool deepCopy)
{
if (deepCopy)
{
this->create(shape, type);
create(shape, type);
memcpy(ptr(), data, this->total() * CV_ELEM_SIZE(type));
}
void Blob::fill(const BlobShape &shape, int type, void *data, bool deepCopy)
else
{
CV_Assert(type == CV_32F || type == CV_64F);
m = Mat(shape.dims(), shape.ptr(), type, data);
}
CV_DNN_UMAT_ONLY(state = HEAD_AT_MAT);
}
if (deepCopy)
{
m.create(shape.dims(), shape.ptr(), type);
memcpy(m.data, data, m.total() * m.elemSize());
}
else
void Blob::setTo(InputArray value, int allocFlags)
{
#ifdef CV_DNN_UMAT
if (allocFlags == -1)
{
if (state == HEAD_AT_UMAT)
um.setTo(value);
else if (state == HEAD_AT_MAT)
m.setTo(value);
else //SYNCED or UNINITIALIZED
{
m = Mat(shape.dims(), shape.ptr(), type, data);
um.setTo(value);
m.setTo(value);
if (state == UNINITIALIZED)
state = SYNCED;
}
}
else if (allocFlags == ALLOC_BOTH)
{
m.setTo(value);
um.setTo(value);
state = SYNCED;
}
else if (allocFlags == ALLOC_MAT)
{
matRef().setTo(value);
}
else if (allocFlags == ALLOC_UMAT)
{
umatRef().setTo(value);
}
else
{
CV_Error(Error::StsBadArg, "allocFlags sholud be -1 or one of Blob::AllocFlag values");
}
#else
m.setTo(value);
#endif
}
void Blob::create(const BlobShape &shape, int type)
void Blob::updateMat(bool syncData) const
{
#ifdef CV_DNN_UMAT
if (state == UNINITIALIZED || state == SYNCED || state == HEAD_AT_MAT)
{
CV_Assert(type == CV_32F || type == CV_64F);
m.create(shape.dims(), shape.ptr(), type);
return;
}
else if (state == HEAD_AT_UMAT)
{
if (syncData)
um.copyTo(m);
else
m.create(dims(), sizes(), type());
state = SYNCED;
}
else
{
CV_Error(Error::StsInternal, "");
}
#else
(void)syncData;
#endif
}
inline void squeezeShape(const int srcDims, const int *srcSizes, const int dstDims, int *dstSizes)
void Blob::updateUMat(bool syncData) const
{
#ifdef CV_DNN_UMAT
if (state == UNINITIALIZED || state == SYNCED || state == HEAD_AT_UMAT)
{
return;
}
else if (state == HEAD_AT_MAT)
{
const int m = std::min(dstDims, srcDims);
if (syncData)
m.copyTo(um);
else
um.create(dims(), sizes(), type());
}
else
{
CV_Error(Error::StsInternal, "");
}
#else
(void)syncData;
#endif
}
void Blob::sync() const
{
updateMat();
updateUMat();
}
//copy common(last) dimensions
for (int i = 0; i < m; i++)
dstSizes[dstDims - 1 - i] = srcSizes[srcDims - 1 - i];
Vec4i Blob::shape4() const
{
return Vec4i(num(), channels(), rows(), cols());
}
//either flatten extra dimensions
for (int i = m; i < srcDims; i++)
dstSizes[0] *= srcSizes[srcDims - 1 - i];
//BlobShape
//either fill gaps
for (int i = m; i < dstDims; i++)
dstSizes[dstDims - 1 - i] = 1;
}
std::ostream &operator<< (std::ostream &stream, const BlobShape &shape)
{
stream << "[";
Vec4i Blob::shape4() const
for (int i = 0; i < shape.dims() - 1; i++)
stream << shape[i] << ", ";
if (shape.dims() > 0)
stream << shape[-1];
return stream << "]";
}
BlobShape computeShapeByReshapeMask(const BlobShape &srcShape, const BlobShape &maskShape, Range srcRange /*= Range::all()*/)
{
if (srcRange == Range::all())
srcRange = Range(0, srcShape.dims());
else
{
return Vec4i(num(), channels(), rows(), cols());
int sz = srcRange.size();
srcRange.start = srcShape.canonicalAxis(srcRange.start);
srcRange.end = (srcRange.end == INT_MAX) ? srcShape.dims() : srcRange.start + sz;
}
std::ostream &operator<< (std::ostream &stream, const BlobShape &shape)
CV_Assert(0 <= srcRange.start && srcRange.start <= srcRange.end && srcRange.end <= srcShape.dims());
BlobShape dstShape(srcShape.dims() - srcRange.size() + maskShape.dims(), (const int*)NULL);
std::copy(srcShape.ptr(), srcShape.ptr() + srcRange.start, dstShape.ptr());
std::copy(srcShape.ptr() + srcRange.end, srcShape.ptr() + srcShape.dims(), dstShape.ptr() + srcRange.start + maskShape.dims());
int inferDim = -1;
for (int i = 0; i < maskShape.dims(); i++)
{
stream << "[";
if (maskShape[i] > 0)
{
dstShape[srcRange.start + i] = maskShape[i];
}
else if (maskShape[i] == 0)
{
if (srcRange.start + i >= srcShape.dims())
CV_Error(Error::StsBadArg, format("Copy dim[%d] (which has zero size) is out of the source shape bounds", srcRange.start + i));
dstShape[srcRange.start + i] = srcShape[srcRange.start + i];
}
else if (maskShape[i] == -1)
{
if (inferDim != -1)
CV_Error(Error::StsAssert, "Duplicate of inferred dim (which is denoted by -1)");
inferDim = srcRange.start + i;
dstShape[inferDim] = 1;
}
else
CV_Error(Error::StsBadArg, "maskShape[i] >= -1");
}
for (int i = 0; i < shape.dims() - 1; i++)
stream << shape[i] << ", ";
if (shape.dims() > 0)
stream << shape[-1];
if (inferDim != -1)
{
ptrdiff_t srcTotal = srcShape.total();
ptrdiff_t dstTotal = dstShape.total();
if (srcTotal % dstTotal != 0)
CV_Error(Error::StsBackTrace, "Can't infer a dim denoted by -1");
return stream << "]";
dstShape[inferDim] = (int)(srcTotal / dstTotal);
}
else
{
CV_Assert(srcShape.total() == dstShape.total());
}
return dstShape;
}
}
}

@ -191,7 +191,7 @@ namespace
else if (pbBlob.has_shape())
{
const caffe::BlobShape &_shape = pbBlob.shape();
BlobShape shape(_shape.dim_size());
BlobShape shape = BlobShape::all(_shape.dim_size());
for (int i = 0; i < _shape.dim_size(); i++)
shape[i] = (int)_shape.dim(i);
@ -201,7 +201,7 @@ namespace
else
{
CV_Error(Error::StsError, "Unknown shape of input blob");
return BlobShape(-1);
return BlobShape();
}
}

@ -0,0 +1,294 @@
#include "../precomp.hpp"
#include "layer_loaders.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#include <climits>
namespace cv
{
namespace dnn
{
//Utils
//Extracts params used into Conv, Deconv and Pooling layers
static void getCaffeConvParams(LayerParams &params, Size &kernel, Size &pad, Size &stride)
{
if (params.has("kernel_h") && params.has("kernel_w"))
{
kernel.height = params.get<int>("kernel_h");
kernel.width = params.get<int>("kernel_w");
}
else if (params.has("kernel_size"))
{
kernel.height = kernel.width = params.get<int>("kernel_size");
}
else
{
CV_Error(Error::StsBadArg, "kernel_size (or kernel_h and kernel_w) not specified");
}
CV_Assert(kernel.height > 0 && kernel.width > 0);
if (params.has("pad_h") && params.has("pad_w"))
{
pad.height = params.get<int>("pad_h");
pad.width = params.get<int>("pad_w");
}
else
{
pad.height = pad.width = params.get<int>("pad", 0);
}
CV_Assert(pad.height >= 0 && pad.width >= 0);
if (params.has("stride_h") && params.has("stride_w"))
{
stride.height = params.get<int>("stride_h");
stride.width = params.get<int>("stride_w");
}
else
{
stride.height = stride.width = params.get<int>("stride", 1);
}
CV_Assert(stride.height > 0 && stride.width > 0);
}
//Layers
//Convolution and Deconvolution
static void initConvDeconvLayerFromCaffe(Ptr<BaseConvolutionLayer> l, LayerParams &params)
{
l->setParamsFrom(params);
getCaffeConvParams(params, l->kernel, l->pad, l->stride);
bool bias = params.get<bool>("bias_term", true);
int numOutput = params.get<int>("num_output");
int group = params.get<int>("group", 1);
CV_Assert(numOutput % group == 0);
CV_Assert((bias && l->blobs.size() == 2) || (!bias && l->blobs.size() == 1));
}
template<>
Ptr<Layer> createLayerFromCaffe<ConvolutionLayer>(LayerParams &params)
{
Ptr<BaseConvolutionLayer> l = ConvolutionLayer::create();
initConvDeconvLayerFromCaffe(l, params);
return Ptr<Layer>(l);
}
template<>
Ptr<Layer> createLayerFromCaffe<DeconvolutionLayer>(LayerParams &params)
{
Ptr<BaseConvolutionLayer> l = DeconvolutionLayer::create();
initConvDeconvLayerFromCaffe(l, params);
return Ptr<Layer>(l);
}
template<>
Ptr<Layer> createLayerFromCaffe<PoolingLayer>(LayerParams &params)
{
int type;
Size kernel, stride, pad;
if (params.has("pool"))
{
String pool = params.get<String>("pool").toLowerCase();
if (pool == "max")
type = PoolingLayer::MAX;
else if (pool == "ave")
type = PoolingLayer::AVE;
else if (pool == "stochastic")
type = PoolingLayer::STOCHASTIC;
else
CV_Error(Error::StsBadArg, "Unknown pooling type \"" + pool + "\"");
}
else
{
type = PoolingLayer::MAX;
}
getCaffeConvParams(params, kernel, pad, stride);
return Ptr<Layer>(PoolingLayer::create(type, kernel, stride, pad));
}
template<>
Ptr<Layer> createLayerFromCaffe<SoftmaxLayer>(LayerParams &params)
{
int axis = params.get<int>("axis", 1);
return Ptr<Layer>(SoftmaxLayer::create(axis));
}
template<> //InnerProduct specialization
Ptr<Layer> createLayerFromCaffe<InnerProductLayer>(LayerParams &params)
{
const std::vector<Blob> &blobs = params.blobs;
CV_Assert(1 <= blobs.size() && blobs.size() <= 2);
int numOutputs = params.get<int>("num_output");
int innerSize = (int)blobs[0].total() / numOutputs;
bool bias = params.get<bool>("bias_term", true);
int axis = params.get<int>("axis", 1);
CV_Assert(blobs[0].dims() >= 2 && (size_t)(innerSize * numOutputs) == blobs[0].total());
CV_Assert(!bias || (blobs.size() == 2 && (size_t)numOutputs == blobs[1].total()));
Ptr<InnerProductLayer> l = InnerProductLayer::create(axis);
l->setParamsFrom(params);
l->blobs[0].reshape(Shape(numOutputs, innerSize));
if (bias)
l->blobs[1].reshape(Shape(1, numOutputs));
return Ptr<Layer>(l);
}
template<> //LRNLayer specialization
Ptr<Layer> createLayerFromCaffe<LRNLayer>(LayerParams& params)
{
int type;
String nrmType = params.get<String>("norm_region", "ACROSS_CHANNELS");
if (nrmType == "ACROSS_CHANNELS")
type = LRNLayer::CHANNEL_NRM;
else if (nrmType == "WITHIN_CHANNEL")
type = LRNLayer::SPATIAL_NRM;
else
CV_Error(Error::StsBadArg, "Unknown region type \"" + nrmType + "\"");
int size = params.get<int>("local_size", 5);
if (size % 2 != 1 || size <= 0)
CV_Error(Error::StsBadArg, "LRN layer supports only positive odd values for local_size");
double alpha = params.get<double>("alpha", 1);
double beta = params.get<double>("beta", 0.75);
return Ptr<Layer>(LRNLayer::create(type, size, alpha, beta));
}
template<>
Ptr<Layer> createLayerFromCaffe<MVNLayer>(LayerParams &params)
{
return Ptr<Layer>(MVNLayer::create(
params.get<bool>("normalize_variance", true),
params.get<bool>("across_channels", false),
params.get<double>("eps", 1e-9)
));
}
/* Reshape layers */
template<>
Ptr<Layer> createLayerFromCaffe<ReshapeLayer>(LayerParams &params)
{
int axis = params.get<int>("axis", 0);
int numAxes = params.get<int>("num_axes", -1);
CV_Assert(numAxes >= -1);
Range applyingRange = (numAxes == -1) ? Range(axis, INT_MAX) : Range(axis, axis + numAxes);
Shape newShape;
if (params.has("dim"))
{
const DictValue &paramShape = params.get("dim");
newShape = Shape::all(paramShape.size());
for (int i = 0; i < paramShape.size(); i++)
newShape[i] = paramShape.get<int>(i);
}
else
newShape = Shape::all(0);
return Ptr<Layer>(ReshapeLayer::create(newShape, applyingRange));
}
Ptr<Layer> createFlattenLayerFromCaffe(LayerParams&)
{
return Ptr<Layer>(ReshapeLayer::create(Shape(0, -1)));
}
template<>
Ptr<Layer> createLayerFromCaffe<ConcatLayer>(LayerParams& params)
{
return Ptr<Layer>(ConcatLayer::create(params.get<int>("axis", 1)));
}
template<>
Ptr<Layer> createLayerFromCaffe<SplitLayer>(LayerParams &params)
{
int outputsCount;
//TODO: maybe "top_count" param is useless because it can be determined by output connections number
if (params.has("top_count"))
{
outputsCount = params.get<int>("top_count");
CV_Assert(outputsCount >= 0);
}
else
{
outputsCount = -1;
}
return Ptr<Layer>(SplitLayer::create(outputsCount));
}
template<>
Ptr<Layer> createLayerFromCaffe<SliceLayer>(LayerParams& params)
{
int axis = params.get<int>("axis", 1);
if (!params.has("slice_point"))
{
return Ptr<Layer>(SliceLayer::create(axis));
}
else
{
const DictValue &indicesValue = params.get("slice_point");
std::vector<int> sliceIndices(indicesValue.size());
for (int i = 0; i < indicesValue.size(); i++)
sliceIndices[i] = indicesValue.get<int>(i);
return Ptr<Layer>(SliceLayer::create(axis, sliceIndices));
}
}
/* Activation layers */
template <typename ActivationLayer> //Intended for parameters-free activations
Ptr<Layer> createLayerFromCaffe(LayerParams&)
{
return Ptr<Layer>(ActivationLayer::create());
}
template<> //ReLU specialization
Ptr<Layer> createLayerFromCaffe<ReLULayer>(LayerParams& params)
{
float negative_slope = params.get<float>("negative_slope", 0.f);
return Ptr<Layer>(ReLULayer::create(negative_slope));
}
template<> //Power specialization
Ptr<Layer> createLayerFromCaffe<PowerLayer>(LayerParams& params)
{
float power = params.get<float>("power", 1.0f);
float scale = params.get<float>("scale", 1.0f);
float shift = params.get<float>("shift", 0.0f);
return Ptr<Layer>(PowerLayer::create(power, scale, shift));
}
//Explicit instantiation
template Ptr<Layer> createLayerFromCaffe<ConvolutionLayer>(LayerParams&);
template Ptr<Layer> createLayerFromCaffe<DeconvolutionLayer>(LayerParams&);
template Ptr<Layer> createLayerFromCaffe<SoftmaxLayer>(LayerParams&);
template Ptr<Layer> createLayerFromCaffe<InnerProductLayer>(LayerParams&);
template Ptr<Layer> createLayerFromCaffe<LRNLayer>(LayerParams&);
template Ptr<Layer> createLayerFromCaffe<MVNLayer>(LayerParams&);
template Ptr<Layer> createLayerFromCaffe<ConcatLayer>(LayerParams&);
template Ptr<Layer> createLayerFromCaffe<SliceLayer>(LayerParams&);
template Ptr<Layer> createLayerFromCaffe<SplitLayer>(LayerParams&);
template Ptr<Layer> createLayerFromCaffe<ReLULayer>(LayerParams&);
template Ptr<Layer> createLayerFromCaffe<SigmoidLayer>(LayerParams&);
template Ptr<Layer> createLayerFromCaffe<TanHLayer>(LayerParams&);
template Ptr<Layer> createLayerFromCaffe<AbsLayer>(LayerParams&);
template Ptr<Layer> createLayerFromCaffe<BNLLLayer>(LayerParams&);
template Ptr<Layer> createLayerFromCaffe<PowerLayer>(LayerParams&);
}
}

@ -0,0 +1,60 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_DNN_CAFFE_LAYER_LOADERS_HPP__
#define __OPENCV_DNN_CAFFE_LAYER_LOADERS_HPP__
#include <opencv2/dnn/all_layers.hpp>
namespace cv
{
namespace dnn
{
//Common template for Caffe layer loaders
template <typename PublicLayer>
Ptr<Layer> createLayerFromCaffe(LayerParams&);
Ptr<Layer> createFlattenLayerFromCaffe(LayerParams&);
}
}
#endif

@ -543,6 +543,13 @@ Layer::Layer(const LayerParams &params)
}
void Layer::setParamsFrom(const LayerParams &params)
{
blobs = params.blobs;
name = params.name;
type = params.type;
}
int Layer::inputNameToIndex(String)
{
return -1;

@ -40,19 +40,8 @@
//M*/
#include "precomp.hpp"
#include "layers/concat_layer.hpp"
#include "layers/convolution_layer.hpp"
#include "caffe/layer_loaders.hpp"
#include "layers/blank_layer.hpp"
#include "layers/elementwise_layers.hpp"
#include "layers/fully_connected_layer.hpp"
#include "layers/lrn_layer.hpp"
#include "layers/mvn_layer.hpp"
#include "layers/pooling_layer.hpp"
#include "layers/reshape_layer.hpp"
#include "layers/slice_layer.hpp"
#include "layers/softmax_layer.hpp"
#include "layers/split_layer.hpp"
namespace cv
{
@ -76,27 +65,27 @@ void initModule()
if (init.status)
return;
REG_RUNTIME_LAYER_CLASS(Slice, SliceLayer)
REG_RUNTIME_LAYER_CLASS(Softmax, SoftMaxLayer)
REG_RUNTIME_LAYER_CLASS(Split, SplitLayer)
REG_RUNTIME_LAYER_CLASS(Reshape, ReshapeLayer)
REG_STATIC_LAYER_FUNC(Flatten, createFlattenLayer)
REG_RUNTIME_LAYER_CLASS(Pooling, PoolingLayer)
REG_RUNTIME_LAYER_CLASS(MVN, MVNLayer)
REG_RUNTIME_LAYER_CLASS(LRN, LRNLayer)
REG_RUNTIME_LAYER_CLASS(InnerProduct, FullyConnectedLayer)
REG_RUNTIME_LAYER_FUNC(Slice, createLayerFromCaffe<SliceLayer>);
REG_RUNTIME_LAYER_FUNC(Split, createLayerFromCaffe<SplitLayer>);
REG_RUNTIME_LAYER_FUNC(Concat, createLayerFromCaffe<ConcatLayer>);
REG_RUNTIME_LAYER_FUNC(Reshape, createLayerFromCaffe<ReshapeLayer>);
REG_RUNTIME_LAYER_FUNC(Flatten, createFlattenLayerFromCaffe);
REG_RUNTIME_LAYER_CLASS(ReLU, ElementWiseLayer<ReLUFunctor>)
REG_RUNTIME_LAYER_CLASS(TanH, ElementWiseLayer<TanHFunctor>)
REG_RUNTIME_LAYER_CLASS(BNLL, ElementWiseLayer<BNLLFunctor>)
REG_RUNTIME_LAYER_CLASS(Power, ElementWiseLayer<PowerFunctor>)
REG_RUNTIME_LAYER_CLASS(AbsVal, ElementWiseLayer<AbsValFunctor>)
REG_RUNTIME_LAYER_CLASS(Sigmoid, ElementWiseLayer<SigmoidFunctor>)
REG_RUNTIME_LAYER_CLASS(Dropout, BlankLayer)
REG_RUNTIME_LAYER_FUNC(Convolution, createLayerFromCaffe<ConvolutionLayer>);
REG_RUNTIME_LAYER_FUNC(Deconvolution, createLayerFromCaffe<DeconvolutionLayer>);
REG_RUNTIME_LAYER_FUNC(Pooling, createLayerFromCaffe<PoolingLayer>);
REG_RUNTIME_LAYER_FUNC(LRN, createLayerFromCaffe<LRNLayer>);
REG_RUNTIME_LAYER_FUNC(InnerProduct, createLayerFromCaffe<InnerProductLayer>);
REG_RUNTIME_LAYER_FUNC(Softmax, createLayerFromCaffe<SoftmaxLayer>);
REG_RUNTIME_LAYER_FUNC(MVN, createLayerFromCaffe<MVNLayer>);
REG_RUNTIME_LAYER_CLASS(Convolution, ConvolutionLayer)
REG_RUNTIME_LAYER_CLASS(Deconvolution, DeConvolutionLayer)
REG_RUNTIME_LAYER_CLASS(Concat, ConcatLayer)
REG_RUNTIME_LAYER_FUNC(ReLU, createLayerFromCaffe<ReLULayer>);
REG_RUNTIME_LAYER_FUNC(Sigmoid, createLayerFromCaffe<SigmoidLayer>);
REG_RUNTIME_LAYER_FUNC(TanH, createLayerFromCaffe<TanHLayer>);
REG_RUNTIME_LAYER_FUNC(BNLL, createLayerFromCaffe<BNLLLayer>);
REG_RUNTIME_LAYER_FUNC(AbsVal, createLayerFromCaffe<AbsLayer>);
REG_RUNTIME_LAYER_FUNC(Power, createLayerFromCaffe<PowerLayer>);
REG_RUNTIME_LAYER_CLASS(Dropout, BlankLayer)
init.status = true;
}

@ -42,60 +42,80 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "concat_layer.hpp"
#include <opencv2/core/ocl.hpp>
namespace cv
{
namespace dnn
{
ConcatLayer::ConcatLayer(LayerParams &params) : Layer(params)
{
axis = params.get<int>("axis", 1);
CV_Assert(axis >= 0);
}
void ConcatLayer::allocate(const std::vector<Blob *> &inputs, std::vector<Blob> &outputs)
{
CV_Assert(inputs.size() > 0);
ConcatLayerImpl::ConcatLayerImpl(int axis_ /*= 1*/)
{
axis = axis_;
}
int refType = inputs[0]->type();
BlobShape refShape = inputs[0]->shape();
CV_Assert(axis < refShape.dims());
void ConcatLayerImpl::allocate(const std::vector<Blob *> &inputs, std::vector<Blob> &outputs)
{
CV_Assert(inputs.size() > 0);
int axisSum = 0;
for (size_t i = 0; i < inputs.size(); i++)
{
BlobShape curShape = inputs[i]->shape();
BlobShape refShape = inputs[0]->shape();
axisIdx = inputs[0]->canonicalAxis(axis);
CV_Assert(curShape.dims() == refShape.dims() && inputs[i]->type() == refType);
for (int axisId = 0; axisId < refShape.dims(); axisId++)
{
if (axisId != axis && refShape[axisId] != curShape[axisId])
CV_Error(Error::StsBadSize, "Inconsitent shape for ConcatLayer");
}
int axisSum = 0;
useOpenCL = false;
for (size_t i = 0; i < inputs.size(); i++)
{
BlobShape curShape = inputs[i]->shape();
axisSum += curShape[axis];
CV_Assert(curShape.dims() == refShape.dims() && inputs[i]->type() == inputs[0]->type());
for (int curAxis = 0; curAxis < refShape.dims(); curAxis++)
{
if (curAxis != axisIdx && refShape[curAxis] != curShape[curAxis])
CV_Error(Error::StsBadSize, "Inconsitent shape for ConcatLayer");
}
refShape[axis] = axisSum;
outputs.resize(1);
outputs[0].create(refShape);
axisSum += curShape[axisIdx];
useOpenCL |= inputs[i]->getState() == Blob::HEAD_AT_MAT;
}
void ConcatLayer::forward(std::vector<Blob *> &inputs, std::vector<Blob> &outputs)
{
const Mat& outMat = outputs[0].matRef();
std::vector<Range> ranges(outputs[0].dims(), Range::all());
int sizeStart = 0;
for (size_t i = 0; i < inputs.size(); i++)
{
int sizeEnd = sizeStart + inputs[i]->size(axis);
ranges[axis] = Range(sizeStart, sizeEnd);
refShape[axisIdx] = axisSum;
useOpenCL &= ocl::useOpenCL();
int allocFlags = (useOpenCL) ? Blob::ALLOC_UMAT : Blob::ALLOC_MAT;
Mat outSubMat = outMat(&ranges[0]);
inputs[i]->matRef().copyTo(outSubMat);
outputs.resize(1);
outputs[0].create(refShape, inputs[0]->type(), allocFlags);
}
sizeStart = sizeEnd;
}
void ConcatLayerImpl::forward(std::vector<Blob *> &inputs, std::vector<Blob> &outputs)
{
#ifdef HAVE_OPENCL
if (useOpenCL)
forward_<UMat>(inputs, outputs);
else
#endif
forward_<Mat>(inputs, outputs);
}
template<typename XMat>
void ConcatLayerImpl::forward_(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
{
XMat& outMat = outputs[0].getRef<XMat>();
std::vector<Range> ranges(outputs[0].dims(), Range::all());
ranges[axisIdx].start = 0;
for (size_t i = 0; i < inputs.size(); i++)
{
ranges[axisIdx].end = ranges[axisIdx].start + inputs[i]->size(axisIdx);
inputs[i]->getRefConst<XMat>().copyTo(outMat(&ranges[0]));
ranges[axisIdx].start = ranges[axisIdx].end;
}
}
Ptr<ConcatLayer> ConcatLayer::create(int axis)
{
return Ptr<ConcatLayer>(new ConcatLayerImpl(axis));
}
}
}

@ -42,20 +42,29 @@
#ifndef __OPENCV_DNN_LAYERS_CONCAT_LAYER_HPP__
#define __OPENCV_DNN_LAYERS_CONCAT_LAYER_HPP__
#include "../precomp.hpp"
#include <opencv2/dnn/all_layers.hpp>
namespace cv
{
namespace dnn
{
class ConcatLayer : public Layer
{
int axis;
public:
ConcatLayer(LayerParams& params);
void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
};
class ConcatLayerImpl : public ConcatLayer
{
bool useOpenCL;
int axisIdx;
template<typename XMat>
void forward_(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
public:
ConcatLayerImpl(int axis_ = 1);
void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
};
}
}
#endif

@ -43,209 +43,295 @@
#include <opencv2/core/ocl.hpp>
#include "layers_common.hpp"
#include "convolution_layer.hpp"
#include "im2col.hpp"
#include "op_im2col.hpp"
#include "op_blas.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#include <iostream>
namespace cv
{
namespace dnn
{
ConvolutionLayer::ConvolutionLayer(LayerParams &params) : Layer(params)
{
getKernelParams(params, kerH, kerW, padH, padW, strideH, strideW);
numOutput = params.get<int>("num_output");
bias = params.get<bool>("bias_term", true);
group = params.get<int>("group", 1);
CV_Assert(numOutput % group == 0);
CV_Assert(!bias || blobs.size() == 2);
CV_Assert( bias || blobs.size() == 1);
const Blob &wgtBlob = blobs[0];
CV_Assert(wgtBlob.dims() == 4 && wgtBlob.cols() == kerW && wgtBlob.rows() == kerH);
ConvolutionLayerImpl::ConvolutionLayerImpl()
{
tryUseOpenCL = true;
numOutput = -1;
group = -1;
if (bias)
#if HAVE_CBLAS
if (getBlasThreads() != cv::getThreadNum())
{
Blob &biasBlob = blobs[1];
CV_Assert(biasBlob.total() == (size_t)numOutput);
setBlasThreads(cv::getThreadNum());
}
#endif
}
//TBD
useOpenCL = params.has("use_opencl");
}
void ConvolutionLayerImpl::init()
{
CV_Assert(1 <= blobs.size() && blobs.size() <= 2);
void ConvolutionLayer::allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
{
CV_Assert(inputs.size() > 0);
bias = (blobs.size() >= 2);
numOutput = blobs[0].num();
const Blob &inpBlob = *inputs[0];
CV_Assert(inpBlob.dims() == 4 && inpBlob.type() == CV_32F);
computeInpOutShape(inpBlob);
CV_Assert(blobs[0].dims() == 4 && blobs[0].cols() == kernel.width && blobs[0].rows() == kernel.height);
CV_Assert(!bias || blobs[1].total() == (size_t)blobs[0].num());
CV_Assert(inpCn % group == 0 && outCn % group == 0);
CV_Assert(blobs[0].num() == outCn && blobs[0].channels() == inpCn / group);
useOpenCL = ocl::useOpenCL() && tryUseOpenCL;
}
outGroupCn = outCn / group;
inpGroupCn = inpCn / group;
ksize = inpGroupCn * kerH * kerW;
void ConvolutionLayerImpl::allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
{
init();
outputs.resize(inputs.size());
for (size_t i = 0; i < inputs.size(); i++)
{
CV_Assert(inputs[i]->type() == inpBlob.type());
CV_Assert(inputs[i]->dims() == 4 && inputs[i]->channels() == inpBlob.channels());
CV_Assert(inputs[i]->rows() == inpBlob.rows() && inputs[i]->cols() == inpBlob.cols());
CV_Assert(inputs.size() > 0);
const Blob &input = *inputs[0];
CV_Assert(input.dims() == 4 && (input.type() == CV_32F || input.type() == CV_64F));
computeInpOutShape(input);
outputs[i].create(BlobShape(inputs[i]->num(), topCn, topH, topW));
}
group = inpCn / blobs[0].channels();
CV_Assert(inpCn % group == 0 && outCn % group == 0);
CV_Assert(blobs[0].num() == outCn && blobs[0].channels() == inpCn / group);
if (!is1x1())
colMat.create(ksize, outH * outW, inpBlob.type());
outGroupCn = outCn / group;
inpGroupCn = inpCn / group;
ksize = inpGroupCn * kernel.height * kernel.width;
if (bias)
biasOnesMat = Mat::ones(1, topH * topW, inpBlob.type());
for (size_t i = 0; i < inputs.size(); i++)
{
CV_Assert(inputs[i]->type() == input.type());
CV_Assert(inputs[i]->dims() == 4 && inputs[i]->channels() == input.channels());
CV_Assert(inputs[i]->rows() == input.rows() && inputs[i]->cols() == input.cols());
}
inline bool ConvolutionLayer::is1x1() const
int allocFlags = useOpenCL ? Blob::ALLOC_UMAT : Blob::ALLOC_MAT;
if (!is1x1())
{
return (kerH == 1 && kerW == 1);
colBlob.create(Shape(ksize, outH * outW), input.type(), allocFlags);
}
void ConvolutionLayer::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
if (bias)
{
Blob &wgtBlob = blobs[0];
biasOnesBlob.create(Shape(1, topH * topW), input.type(), allocFlags);
biasOnesBlob.setTo(1);
}
for (size_t ii = 0; ii < outputs.size(); ii++)
{
Blob &inpBlob = *inputs[ii];
Blob &outBlob = outputs[ii];
outputs.resize(inputs.size());
for (size_t i = 0; i < inputs.size(); i++)
{
outputs[i].create(Shape(inputs[i]->num(), topCn, topH, topW), input.type(), allocFlags);
}
}
bool ConvolutionLayerImpl::is1x1() const
{
return (kernel.height == 1 && kernel.width == 1) &&
(stride.height == 1 && stride.width == 1);
}
template<typename XMat>
void ConvolutionLayerImpl::forward_(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
{
XMat weightsMat = reshaped(blobs[0].getRefConst<XMat>(), Shape(outCn, ksize));
XMat biasesMat = reshaped(blobs[1].getRefConst<XMat>(), Shape(outCn, 1));
for (int n = 0; n < inpBlob.num(); n++)
for (size_t ii = 0; ii < outputs.size(); ii++)
{
int numImg = inputs[ii]->size(0);
XMat inpMat = inputs[ii]->getRefConst<XMat>();
XMat outMat = reshaped(outputs[ii].getRef<XMat>(), Shape(numImg*group*outGroupCn, outH*outW));
for (int n = 0; n < numImg; n++)
{
for (int g = 0; g < group; g++)
{
for (int g = 0; g < group; g++)
{
im2col(inpBlob, n, g);
XMat colMat, curInp = slice(inpMat, n, _Range(g * inpGroupCn, inpGroupCn));
im2col(curInp, colMat);
_Range kerRange(g * outGroupCn, outGroupCn);
XMat kerMat = weightsMat.rowRange(kerRange);
Mat kerMat(outGroupCn, ksize, wgtBlob.type(), wgtBlob.ptr(g*outGroupCn));
Mat dstMat(outGroupCn, outH*outW, outBlob.type(), outBlob.ptr(n, g*outGroupCn));
_Range outRange((g + n * group) * outGroupCn, outGroupCn);
XMat dstMat = outMat.rowRange(outRange);
cv::gemm(kerMat, colMat, 1, noArray(), 0, dstMat);
dnn::gemm(kerMat, colMat, 1, dstMat, 0);
if (bias)
{
float *biasPtr = blobs[1].ptrf() + g*outGroupCn;
Mat biasMat(outGroupCn, 1, CV_32F, biasPtr);
cv::gemm(biasMat, biasOnesMat, 1, dstMat, 1, dstMat);
}
if (bias)
{
dnn::gemm(biasesMat.rowRange(kerRange), biasOnesBlob.getRefConst<XMat>(), 1, dstMat, 1);
}
}
}
}
}
void ConvolutionLayer::im2col(Blob &inpBlob, int imNum, int cnGroup)
{
uchar *srcPtr = inpBlob.ptr(imNum, cnGroup*inpGroupCn);
if (is1x1())
{
colMat = Mat(ksize, inpBlob.rows()*inpBlob.cols(), inpBlob.type(), srcPtr);
return;
}
void ConvolutionLayerImpl::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
{
if (!useOpenCL)
forward_<Mat>(inputs, outputs);
else
forward_<UMat>(inputs, outputs);
}
void ConvolutionLayerImpl::im2col(const UMat &srcImg, UMat &dstCol)
{
if (is1x1())
{
dstCol = reshaped(srcImg, Shape(ksize, outH*outW));
return;
}
#ifdef HAVE_OPENCL
if (useOpenCL && ocl::useOpenCL() && inpBlob.type() == CV_32F && !is1x1())
{
std::vector<Range> ranges(4, Range::all());
ranges[0] = Range(imNum, imNum+1);
ranges[1] = Range(cnGroup*inpGroupCn, (cnGroup + 1)*inpGroupCn);
UMat src = inpBlob.matRef()(&ranges[0]).getUMat(ACCESS_READ);
UMat dst(colMat.size(), colMat.type());
im2col_ocl(src, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dst);
dst.copyTo(colMat);
return;
}
#endif // HAVE_OPENCL
CV_Assert(im2col_ocl(srcImg, inpGroupCn, inpH, inpW, kernel.height, kernel.width, pad.height, pad.width, stride.height, stride.width, this->colBlob.umatRef()));
dstCol = this->colBlob.umatRefConst();
#else
CV_Error(Error::StsInternal, "");
dstCol = srcImg; //supress warning
#endif
}
if (inpBlob.type() == CV_32F)
im2col_cpu((float *)srcPtr, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (float *)colMat.ptr());
if (inpBlob.type() == CV_64F)
im2col_cpu((double*)srcPtr, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (double*)colMat.ptr());
void ConvolutionLayerImpl::im2col(const Mat &srcImg, Mat &dstCol)
{
if (is1x1())
{
dstCol = reshaped(srcImg, Shape(ksize, outH*outW));
return;
}
void ConvolutionLayer::computeInpOutShape(const Blob &inpBlob)
{
inpH = inpBlob.rows();
inpW = inpBlob.cols();
inpCn = inpBlob.channels();
Mat &colMat = colBlob.matRef();
if (srcImg.type() == CV_32F)
im2col_CpuPBody<float>::run(srcImg.ptr<float>(), inpGroupCn, inpH, inpW, kernel.height, kernel.width, pad.height, pad.width, stride.height, stride.width, colMat.ptr<float>());
if (srcImg.type() == CV_64F)
im2col_CpuPBody<double>::run(srcImg.ptr<double>(), inpGroupCn, inpH, inpW, kernel.height, kernel.width, pad.height, pad.width, stride.height, stride.width, colMat.ptr<double>());
outH = (inpH + 2 * padH - kerH) / strideH + 1;
outW = (inpW + 2 * padW - kerW) / strideW + 1;
outCn = numOutput;
dstCol = colMat;
}
topH = outH; topW = outW; topCn = outCn;
}
void ConvolutionLayerImpl::computeInpOutShape(const Blob &input)
{
inpH = input.rows();
inpW = input.cols();
inpCn = input.channels();
DeConvolutionLayer::DeConvolutionLayer(LayerParams &params)
: ConvolutionLayer(params) {}
outH = (inpH + 2 * pad.height - kernel.height) / stride.height + 1;
outW = (inpW + 2 * pad.width - kernel.width) / stride.width + 1;
outCn = numOutput;
void DeConvolutionLayer::computeInpOutShape(const Blob &inpBlob)
{
outH = inpBlob.rows();
outW = inpBlob.cols();
outCn = inpBlob.channels();
topH = outH; topW = outW; topCn = outCn;
}
inpH = strideH * (outH - 1) + kerH - 2 * padH;
inpW = strideW * (outW - 1) + kerW - 2 * padW;
inpCn = numOutput;
//Deconvolution
topH = inpH; topW = inpW; topCn = inpCn;
}
DeConvolutionLayerImpl::DeConvolutionLayerImpl()
{
}
void DeConvolutionLayerImpl::computeInpOutShape(const Blob &inpBlob)
{
outH = inpBlob.rows();
outW = inpBlob.cols();
outCn = inpBlob.channels();
inpH = stride.height * (outH - 1) + kernel.height - 2 * pad.height;
inpW = stride.width * (outW - 1) + kernel.width - 2 * pad.width;
inpCn = numOutput;
topH = inpH; topW = inpW; topCn = inpCn;
}
void DeConvolutionLayerImpl::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
{
if (!useOpenCL)
forward_<Mat>(inputs, outputs);
else
forward_<UMat>(inputs, outputs);
}
void DeConvolutionLayer::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
template<typename XMat>
void DeConvolutionLayerImpl::forward_(std::vector<Blob *> &inputs, std::vector<Blob> &outputs)
{
XMat weightsMat = reshaped(blobs[0].getRefConst<XMat>(), Shape(outCn, ksize));
XMat biasesMat = reshaped(blobs[1].getRefConst<XMat>(), Shape(outCn, 1));
for (size_t ii = 0; ii < outputs.size(); ii++)
{
Blob &wghtBlob = blobs[0];
int numImg = inputs[ii]->size(0);
XMat convBlob = reshaped(inputs[ii]->getRefConst<XMat>(), Shape(numImg*outCn, outH*outW));
XMat decnBlob = reshaped(outputs[ii].getRef<XMat>(), Shape(numImg*inpCn, inpH*inpW));
for (size_t ii = 0; ii < outputs.size(); ii++)
for (int n = 0; n < numImg; n++)
{
Blob &convBlob = *inputs[ii];
Blob &decnBlob = outputs[ii];
for (int n = 0; n < convBlob.num(); n++)
for (int g = 0; g < group; g++)
{
for (int g = 0; g < group; g++)
{
Mat dstMat(inpGroupCn, inpH*inpW, decnBlob.type(), decnBlob.ptr(n, g*inpGroupCn));
XMat dstMat = decnBlob.rowRange(_Range((g + n * group) * inpGroupCn, inpGroupCn));
XMat &colMat = (is1x1()) ? dstMat : colBlob.getRef<XMat>();
if (is1x1())
colMat = dstMat;
XMat convMat = convBlob.rowRange(_Range((g + n * group) * outGroupCn, outGroupCn));
XMat wghtMat = weightsMat.rowRange(_Range(g * outGroupCn, outGroupCn));
Mat convMat(outGroupCn, outH*outW, convBlob.type(), convBlob.ptr(n, g*outGroupCn));
Mat wghtMat(outGroupCn, ksize, wghtBlob.type(), wghtBlob.ptr(g*outGroupCn));
cv::gemm(wghtMat, convMat, 1, noArray(), 0, colMat, GEMM_1_T);
dnn::gemm(wghtMat, convMat, 1, colMat, 0, GEMM_1_T);
col2im(dstMat);
if (!is1x1())
col2im(colMat, dstMat);
if (bias)
{
float *biasPtr = blobs[1].ptrf() + g*inpGroupCn;
Mat biasMat(inpGroupCn, 1, CV_32F, biasPtr);
cv::gemm(biasMat, biasOnesMat, 1, dstMat, 1, dstMat);
}
if (bias)
{
XMat curBiasMat = biasesMat.rowRange(_Range(g * outGroupCn, outGroupCn));
dnn::gemm(curBiasMat, biasOnesBlob.getRefConst<XMat>(), 1, dstMat, 1);
}
}
}
}
}
void DeConvolutionLayer::col2im(Mat &dstMat)
void DeConvolutionLayerImpl::col2im(const Mat &colMat, Mat &dstImg)
{
if (is1x1())
{
if (is1x1()) return;
dstImg = colMat;
return;
}
if (dstImg.type() == CV_32F)
col2im_CpuPBody<float>::run(colMat.ptr<float>(), inpGroupCn, inpH, inpW, kernel.height, kernel.width, pad.height, pad.width, stride.height, stride.width, dstImg.ptr<float>());
if (dstImg.type() == CV_64F)
col2im_CpuPBody<double>::run(colMat.ptr<double>(), inpGroupCn, inpH, inpW, kernel.height, kernel.width, pad.height, pad.width, stride.height, stride.width, dstImg.ptr<double>());
}
if (dstMat.type() == CV_32F)
col2im_cpu((float*)colMat.ptr(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (float*)dstMat.ptr());
if (dstMat.type() == CV_64F)
col2im_cpu((double*)colMat.ptr(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (double*)dstMat.ptr());
void DeConvolutionLayerImpl::col2im(const UMat &colMat, UMat &dstImg)
{
if (is1x1())
{
dstImg = colMat;
return;
}
#ifdef HAVE_OPENCL
CV_Assert(col2im_ocl(colMat, inpGroupCn, inpH, inpW, kernel.height, kernel.width, pad.height, pad.width, stride.height, stride.width, dstImg));
#else
CV_Error(Error::StsInternal, "");
dstImg = colMat;
#endif
}
//Initializers
Ptr<BaseConvolutionLayer> ConvolutionLayer::create(Size kernel, Size stride, Size pad)
{
ConvolutionLayerImpl *l = new ConvolutionLayerImpl();
l->kernel = kernel;
l->pad = pad;
l->stride = stride;
return Ptr<BaseConvolutionLayer>(l);
}
Ptr<BaseConvolutionLayer> DeconvolutionLayer::create(Size kernel, Size stride, Size pad)
{
DeConvolutionLayerImpl *l = new DeConvolutionLayerImpl();
l->kernel = kernel;
l->pad = pad;
l->stride = stride;
return Ptr<BaseConvolutionLayer>(l);
}
}
}

@ -42,51 +42,65 @@
#ifndef __OPENCV_DNN_LAYERS_CONVOLUTION_LAYER_HPP__
#define __OPENCV_DNN_LAYERS_CONVOLUTION_LAYER_HPP__
#include "../precomp.hpp"
#include <opencv2/dnn/all_layers.hpp>
namespace cv
{
namespace dnn
{
//TODO: simultaneously convolution and bias addition for cache optimization
class ConvolutionLayer : public Layer
{
protected:
bool bias;
int numOutput, group;
int padH, padW;
int kerH, kerW;
int strideH, strideW;
int inpH, inpW, inpCn;
int outH, outW, outCn;
int topH, topW, topCn; //switched between inp/out on deconv/conv
int inpGroupCn, outGroupCn;
int ksize;
//TODO: simultaneously convolution and bias addition for cache optimization
class ConvolutionLayerImpl : public ConvolutionLayer
{
public:
ConvolutionLayerImpl();
virtual void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
virtual void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
virtual void init();
protected:
int numOutput, group;
int inpH, inpW, inpCn;
int outH, outW, outCn;
int topH, topW, topCn; //switched between inp/out on deconv/conv
int inpGroupCn, outGroupCn;
int ksize;
bool bias;
bool tryUseOpenCL, useOpenCL;
Blob colBlob, biasOnesBlob;
bool is1x1() const;
virtual void computeInpOutShape(const Blob &inpBlob);
template<typename XMat>
void forward_(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
void im2col(const Mat &srcImg, Mat &dstCol);
void im2col(const UMat &srcImg, UMat &dstCol);
};
class DeConvolutionLayerImpl : public ConvolutionLayerImpl
{
public:
DeConvolutionLayerImpl();
virtual void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
bool useOpenCL;
Mat colMat, biasOnesMat;
protected:
inline bool is1x1() const;
virtual void computeInpOutShape(const Blob &inpBlob);
void im2col(Blob &inpBlob, int imNum, int cnGroup);
virtual void computeInpOutShape(const Blob &inpBlob);
public:
ConvolutionLayer() {}
ConvolutionLayer(LayerParams &params);
void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
};
template<typename XMat>
void forward_(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
void col2im(const Mat &colMat, Mat &dstImg);
void col2im(const UMat &colMat, UMat &dstImg);
};
class DeConvolutionLayer : public ConvolutionLayer
{
protected:
void computeInpOutShape(const Blob &inpBlob);
void col2im(Mat &dstMat);
//Importers
Ptr<Layer> createConvolutionLayerFromCaffe(LayerParams &params);
Ptr<Layer> createDeconvolutionLayerFromCaffe(LayerParams &params);
public:
DeConvolutionLayer(LayerParams &params);
void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
};
}
}
#endif

@ -0,0 +1,46 @@
#include "../precomp.hpp"
#include "elementwise_layers.hpp"
namespace cv
{
namespace dnn
{
#define ACTIVATION_CREATOR_FOR(_Layer, _Functor, ...) \
Ptr<_Layer> _Layer::create() { \
return return Ptr<_Layer>( new ElementWiseLayer<_Functor>(_Functor()) ); }
Ptr<ReLULayer> ReLULayer::create(double negativeSlope)
{
return Ptr<ReLULayer>(new ElementWiseLayer<ReLUFunctor>(ReLUFunctor(negativeSlope)));
}
Ptr<TanHLayer> TanHLayer::create()
{
return Ptr<TanHLayer>(new ElementWiseLayer<TanHFunctor>());
}
Ptr<SigmoidLayer> SigmoidLayer::create()
{
return Ptr<SigmoidLayer>(new ElementWiseLayer<SigmoidFunctor>());
}
Ptr<AbsLayer> AbsLayer::create()
{
return Ptr<AbsLayer>(new ElementWiseLayer<AbsValFunctor>());
}
Ptr<BNLLLayer> BNLLLayer::create()
{
return Ptr<BNLLLayer>(new ElementWiseLayer<BNLLFunctor>());
}
Ptr<PowerLayer> PowerLayer::create(double power /*= 1*/, double scale /*= 1*/, double shift /*= 0*/)
{
const PowerFunctor f(power, scale, shift);
return Ptr<PowerLayer>(new ElementWiseLayer<PowerFunctor>(f));
}
}
}

@ -44,6 +44,11 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include <cmath>
#include <opencv2/dnn/all_layers.hpp>
#include <opencv2/core/ocl.hpp>
#ifdef HAVE_OPENCL
#include "modules/dnn/opencl_kernels_dnn.hpp"
#endif
namespace cv
{
@ -55,130 +60,259 @@ using std::exp;
using std::tanh;
using std::pow;
template<typename Func>
class ElementWiseLayer : public Layer
template<typename Func>
class ElementWiseLayer : public Func::Layer
{
bool useOpenCL;
Func func;
template<typename Dtype>
class PBody : public cv::ParallelLoopBody
{
Func func;
Func &func;
Dtype *data;
public:
ElementWiseLayer(LayerParams &_params) : func(_params) {}
void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
{
outputs.resize(inputs.size());
for (size_t i = 0; i < inputs.size(); i++)
outputs[i].shareFrom(*inputs[i]); //no data copy
}
PBody(Mat &mat, Func &func_) :
func(func_), data(mat.ptr<Dtype>())
{}
void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
void operator()(const Range &r) const
{
for (size_t i = 0; i < inputs.size(); i++)
{
CV_Assert(inputs[i]->ptr() == outputs[i].ptr() && inputs[i]->type() == outputs[i].type());
size_t size = outputs[i].total();
            for (int i = r.start; i < r.end; i++)
                data[i] = func(data[i]);
        }
    };

public:

    ElementWiseLayer() {}
    ElementWiseLayer(const Func &f) : func(f) {}

    void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
    {
        useOpenCL = ocl::useOpenCL();

        outputs.resize(inputs.size());
        for (size_t i = 0; i < inputs.size(); i++)
        {
            outputs[i].shareFrom(*inputs[i]); //no data copy

            //hotfix: shareFrom doesn't provide properly Mat/UMat switching
            if (useOpenCL)
                outputs[i].umatRef() = inputs[i]->umatRefConst();
            else
                outputs[i].matRef() = inputs[i]->matRefConst();
        }
    }

    void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
    {
        #ifdef HAVE_OPENCL
        if (useOpenCL)
            forwardOCL(inputs, outputs);
        else
        #endif
            forwardCPU(inputs, outputs);
    }

    #ifdef HAVE_OPENCL
    void forwardOCL(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
    {
        size_t wgSize = ocl::Device::getDefault().maxWorkGroupSize();

        for (size_t i = 0; i < inputs.size(); i++)
        {
            const UMat &src = inputs[i]->umatRefConst();
            UMat &dst = outputs[i].umatRef();
            CV_Assert(src.isContinuous() && dst.isContinuous() && !src.offset && !dst.offset);

            ocl::Kernel ker;
            CV_Assert(func.initKernel(ker, src));
            ker.set(0, (int)src.total());
            ker.set(1, ocl::KernelArg::PtrReadOnly(src));
            ker.set(2, ocl::KernelArg::PtrWriteOnly(dst));

            size_t gSize = src.total();
            CV_Assert(ker.run(1, &gSize, &wgSize, true));
        }
    }
    #endif

    void forwardCPU(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
    {
        for (size_t i = 0; i < inputs.size(); i++)
        {
            const Mat &src = inputs[i]->matRefConst();
            Mat &dst = outputs[i].matRef();
            CV_Assert(src.ptr() == dst.ptr() && src.isContinuous());

            Range sizeRange = Range(0, dst.total());
            if (dst.type() == CV_32F)
            {
                cv::parallel_for_(sizeRange, PBody<float>(dst, func));
            }
            else if (dst.type() == CV_64F)
            {
                cv::parallel_for_(sizeRange, PBody<double>(dst, func));
            }
            else
            {
                CV_Error(Error::StsNotImplemented, "Only CV_32F and CV_64F blobs are supported");
            }
        }
    }
};

#ifdef HAVE_OPENCL
static String oclGetTMacro(const UMat &m)
{
    return String("-DT=") + ocl::typeToStr(m.type()) + String(" ");
}
#endif

struct ReLUFunctor
{
    typedef ReLULayer Layer;

    double slope;

    ReLUFunctor(double slope_)
        : slope(slope_) {}

    template<typename TFloat>
    inline TFloat operator()(TFloat x) const
    {
        return (x >= (TFloat)0) ? x : (TFloat)slope * x;
    }

    #ifdef HAVE_OPENCL
    bool initKernel(ocl::Kernel &ker, const UMat &src) const
    {
        const char *buildoptSlope = (slope == 0) ? "-DRELU_NO_SLOPE" : "";
        String buildopt = oclGetTMacro(src) + buildoptSlope;

        if (!ker.create("ReLUForward", ocl::dnn::activations_oclsrc, buildopt))
            return false;

        if (slope != 0)
            ker.set(3, (float)slope);

        return true;
    }
    #endif
};

struct TanHFunctor
{
    typedef TanHLayer Layer;

    template<typename TFloat>
    inline TFloat operator()(TFloat x) const
    {
        return tanh(x);
    }

    #ifdef HAVE_OPENCL
    bool initKernel(ocl::Kernel &ker, const UMat &src) const
    {
        if (!ker.create("TanHForward", ocl::dnn::activations_oclsrc, oclGetTMacro(src)))
            return false;
        return true;
    }
    #endif
};

struct SigmoidFunctor
{
    typedef SigmoidLayer Layer;

    template<typename TFloat>
    inline TFloat operator()(TFloat x) const
    {
        return (TFloat)1 / ((TFloat)1 + exp(-x));
    }

    #ifdef HAVE_OPENCL
    bool initKernel(ocl::Kernel &ker, const UMat &src) const
    {
        if (!ker.create("SigmoidForward", ocl::dnn::activations_oclsrc, oclGetTMacro(src)))
            return false;
        return true;
    }
    #endif
};

struct AbsValFunctor
{
    typedef AbsLayer Layer;

    template<typename TFloat>
    inline TFloat operator()(TFloat x) const
    {
        return abs(x);
    }

    #ifdef HAVE_OPENCL
    bool initKernel(ocl::Kernel &ker, const UMat &src) const
    {
        if (!ker.create("AbsValForward", ocl::dnn::activations_oclsrc, oclGetTMacro(src)))
            return false;
        return true;
    }
    #endif
};

struct BNLLFunctor
{
    typedef BNLLLayer Layer;

    template<typename TFloat>
    inline TFloat operator()(TFloat x) const
    {
        return log((TFloat)1 + exp(-abs(x)));
    }

    #ifdef HAVE_OPENCL
    bool initKernel(ocl::Kernel &ker, const UMat &src) const
    {
        if (!ker.create("BNLLForward", ocl::dnn::activations_oclsrc, oclGetTMacro(src)))
            return false;
        return true;
    }
    #endif
};

struct PowerFunctor
{
    typedef PowerLayer Layer;

    double power, scale, shift;

    PowerFunctor(double power_, double scale_ = 1, double shift_ = 0)
        : power(power_), scale(scale_), shift(shift_) {}

    template<typename TFloat>
    inline TFloat operator()(TFloat x) const
    {
        return pow((TFloat)shift + (TFloat)scale * x, (TFloat)power);
    }

    #ifdef HAVE_OPENCL
    bool initKernel(ocl::Kernel &ker, const UMat &src) const
    {
        if (!ker.create("PowForward", ocl::dnn::activations_oclsrc, oclGetTMacro(src)))
            return false;

        ker.set(3, (float)power);
        ker.set(4, (float)scale);
        ker.set(5, (float)shift);
        return true;
    }
    #endif
};

}
}
#endif
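Reviewer note: every activation above is the same template instantiated with a different functor. A minimal usage sketch (illustrative, not part of the patch; assumes the headers of this file):

    // Leaky ReLU with slope 0.1 as an ElementWiseLayer instantiation:
    Ptr<Layer> relu(new ElementWiseLayer<ReLUFunctor>(ReLUFunctor(0.1)));
    // allocate() shares the input blobs, so forward() applies func(x) in-place,
    // via parallel_for_ on CPU or the "ReLUForward" OpenCL kernel on GPU.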

@ -42,73 +42,88 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "fully_connected_layer.hpp"
#include "op_blas.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#include <opencv2/core/ocl.hpp>
namespace cv
{
namespace dnn
{
FullyConnectedLayer::FullyConnectedLayer(LayerParams &params) : Layer(params)
{
numOutputs = params.get<int>("num_output");
bias = params.get<bool>("bias_term", true);
axis_ = params.get<int>("axis", 1);
CV_Assert(blobs.size() == (bias ? 2U : 1U));
CV_Assert(blobs[0].dims() >= 2 && blobs[0].total() >= (size_t)numOutputs);
CV_Assert(!bias || blobs[1].total() == (size_t)numOutputs);
}
FullyConnectedLayerImpl::FullyConnectedLayerImpl(int axis_)
{
axis = axis_;
}
void FullyConnectedLayer::allocate(const std::vector<Blob*> &input, std::vector<Blob> &output)
{
CV_Assert(input.size() > 0);
void FullyConnectedLayerImpl::allocate(const std::vector<Blob*> &input, std::vector<Blob> &output)
{
CV_Assert(input.size() > 0);
CV_Assert(1 <= blobs.size() && blobs.size() <= 2);
CV_Assert(blobs[0].dims() == 2);
axis = input[0]->canonicalAxis(axis_);
innerSize = (int)input[0]->total(axis);
bias = (blobs.size() >= 1);
axisCan = input[0]->canonicalAxis(axis);
dtype = input[0]->type();
numOutput = blobs[0].size(0);
innerSize = blobs[0].size(1);
outerSize = input[0]->total(0, axisCan);
CV_Assert((size_t)innerSize * (size_t)numOutputs == blobs[0].total());
CV_Assert(blobs[0].size(-2) == numOutputs && blobs[0].size(-1) == innerSize);
CV_Assert((size_t)innerSize == input[0]->total(axisCan));
CV_Assert(!bias || (size_t)numOutput == blobs[1].total());
output.resize(input.size());
for (size_t i = 0; i < input.size(); i++)
{
if (i != 0)
CV_Assert(input[i]->equalShape(*input[0]));
useOpenCL = ocl::useOpenCL();
int allocFlags = useOpenCL ? Blob::ALLOC_UMAT : Blob::ALLOC_UMAT;
this->reshape(*input[i], output[i]);
}
}
biasOnesBlob.create(Shape(outerSize, 1), dtype, allocFlags);
biasOnesBlob.setTo(1);
void FullyConnectedLayer::reshape(const Blob &inp, Blob &out)
output.resize(input.size());
for (size_t i = 0; i < input.size(); i++)
{
BlobShape inpShape = inp.shape();
BlobShape outShape(axis+1, inpShape.ptr());
outShape[axis] = numOutputs;
CV_Assert(i == 0 || (input[i]->equalShape(*input[0]) && input[i]->type() == dtype));
Shape outShape = input[i]->shape().slice(0, axis) + Shape(numOutput);
output[i].create(outShape, dtype, allocFlags);
}
}
out.create(outShape, inp.type());
void FullyConnectedLayerImpl::forward(std::vector<Blob*> &input, std::vector<Blob> &output)
{
#ifdef HAVE_OPENCL
if (useOpenCL)
forward_<UMat>(input, output);
else
#endif
forward_<Mat>(input, output);
}
template<typename XMat>
void FullyConnectedLayerImpl::forward_(std::vector<Blob *> &input, std::vector<Blob> &output)
{
const XMat &weight = blobs[0].getRefConst<XMat>();
const XMat *biasMat = NULL, *biasOnesMat = NULL;
if (bias)
{
biasOnesMat = &biasOnesBlob.getRefConst<XMat>();
biasMat = &blobs[1].getRefConst<XMat>();
}
void FullyConnectedLayer::forward(std::vector<Blob*> &input, std::vector<Blob> &output)
for (size_t i = 0; i < input.size(); i++)
{
for (size_t i = 0; i < input.size(); i++)
{
int M = (int)input[i]->total(0, axis);
int N = numOutputs;
int K = innerSize;
Mat srcMat(M, K, input[i]->type(), input[i]->ptrf());
Mat weight(N, K, blobs[0].type(), blobs[0].ptrf());
Mat dstMat(M, N, output[i].type(), output[i].ptrf());
//important: Caffe stores weights as transposed array
cv::gemm(srcMat, weight, 1, noArray(), 0, dstMat, GEMM_2_T);
if (bias)
{
Mat biasOnesMat = Mat::ones(M, 1, CV_32F);
Mat biasMat(1, N, CV_32F, blobs[1].ptrf());
cv::gemm(biasOnesMat, biasMat, 1, dstMat, 1, dstMat);
}
}
const XMat srcMat = reshaped(input[i]->getRefConst<XMat>(), Shape(outerSize, innerSize));
XMat dstMat = reshaped(output[i].getRef<XMat>(), Shape(outerSize, numOutput));
dnn::gemm(srcMat, weight, 1, dstMat, 0, GEMM_2_T);
if (bias)
dnn::gemm(*biasOnesMat, *biasMat, 1, dstMat, 1);
}
}
Ptr<InnerProductLayer> InnerProductLayer::create(int axis)
{
return Ptr<InnerProductLayer>(new FullyConnectedLayerImpl(axis));
}
}
}
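Reviewer note on the GEMM_2_T trick above, sketched with concrete made-up sizes (Caffe stores the weight matrix transposed, so no copy is needed):

    // outerSize = 10 samples, innerSize = 512, numOutput = 100:
    Mat src(10, 512, CV_32F), weight(100, 512, CV_32F), dst(10, 100, CV_32F);
    dnn::gemm(src, weight, 1, dst, 0, GEMM_2_T); // dst = src * weight^T -> 10x100
    // the bias is then a rank-1 update: ones(10,1) * bias(1,100) added onto dst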

@ -42,26 +42,30 @@
#ifndef __OPENCV_DNN_LAYERS_FULLY_CONNECTED_LAYER_HPP__
#define __OPENCV_DNN_LAYERS_FULLY_CONNECTED_LAYER_HPP__
#include "../precomp.hpp"
#include <opencv2/dnn/all_layers.hpp>

namespace cv
{
namespace dnn
{

class FullyConnectedLayerImpl : public InnerProductLayer
{
    int axisCan, dtype;
    int numOutput, innerSize, outerSize;
    bool bias, useOpenCL;
    Blob biasOnesBlob;

    template<typename XMat>
    void forward_(std::vector<Blob*> &input, std::vector<Blob> &output);

public:
    FullyConnectedLayerImpl(int axisCan = 1);
    void allocate(const std::vector<Blob*> &input, std::vector<Blob> &output);
    void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
};

}
}
#endif

@ -46,44 +46,5 @@ namespace cv
namespace dnn
{
void getKernelParams(LayerParams &params, int &kernelH, int &kernelW, int &padH, int &padW, int &strideH, int &strideW)
{
if (params.has("kernel_h") && params.has("kernel_w"))
{
kernelH = params.get<int>("kernel_h");
kernelW = params.get<int>("kernel_w");
}
else if (params.has("kernel_size"))
{
kernelH = kernelW = params.get<int>("kernel_size");
}
else
{
CV_Error(cv::Error::StsBadArg, "kernel_size (or kernel_h and kernel_w) not specified");
}
if (params.has("pad_h") && params.has("pad_w"))
{
padH = params.get<int>("pad_h");
padW = params.get<int>("pad_w");
}
else
{
padH = padW = params.get<int>("pad", 0);
}
if (params.has("stride_h") && params.has("stride_w"))
{
strideH = params.get<int>("stride_h");
strideW = params.get<int>("stride_w");
}
else
{
strideH = strideW = params.get<int>("stride", 1);
}
CV_Assert(kernelH > 0 && kernelW > 0 && padH >= 0 && padW >= 0 && strideH > 0 && strideW > 0);
}
}
}
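The parameter precedence above, shown with illustrative values (LayerParams::set comes from Dict):

    LayerParams p;
    p.set("kernel_size", 3);   // kernelH = kernelW = 3
    p.set("stride", 2);        // strideH = strideW = 2; pad defaults to 0
    int kh, kw, ph, pw, sh, sw;
    getKernelParams(p, kh, kw, ph, pw, sh, sw);
    // explicit kernel_h/kernel_w (when both are present) override kernel_size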

@ -42,14 +42,14 @@
#ifndef __OPENCV_DNN_LAYERS_LAYERS_COMMON_HPP__
#define __OPENCV_DNN_LAYERS_LAYERS_COMMON_HPP__
#include <opencv2/dnn.hpp>
#include "op_blas.hpp"
#include "op_im2col.hpp"
namespace cv
{
namespace dnn
{
void getKernelParams(LayerParams &params, int &kernelH, int &kernelW, int &padH, int &padW, int &strideH, int &strideW);
}
}

@ -42,123 +42,213 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "lrn_layer.hpp"
#include "modules/dnn/opencl_kernels_dnn.hpp"
#include <opencv2/imgproc.hpp>
#include <opencv2/core/ocl.hpp>
#include <opencv2/dnn/shape_utils.hpp>
#include <algorithm>
namespace cv
{
namespace dnn
{
LRNLayer::LRNLayer(LayerParams &params) : Layer(params)
{
String nrmType = params.get<String>("norm_region", "ACROSS_CHANNELS");
if (nrmType == "ACROSS_CHANNELS")
type = CHANNEL_NRM;
else if (nrmType == "WITHIN_CHANNEL")
type = SPATIAL_NRM;
else
CV_Error(Error::StsBadArg, "Unknown region type \"" + nrmType + "\"");
size = params.get<int>("local_size", 5);
if (size % 2 != 1 || size <= 0)
CV_Error(Error::StsBadArg, "LRN layer supports only positive odd values for local_size");
alpha = params.get<double>("alpha", 1);
beta = params.get<double>("beta", 0.75);
}
void LRNLayer::allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
{
CV_Assert(inputs.size() == 1);
outputs.resize(1);
LRNLayerImpl::LRNLayerImpl(int type_, int size_, double alpha_, double beta_)
{
type = type_;
size = size_;
alpha = alpha_;
beta = beta_;
}
Vec4i shape = inputs[0]->shape4();
outputs[0].create(shape);
void LRNLayerImpl::allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
{
CV_Assert(inputs.size() == 1 && inputs[0]->dims() == 4);
CV_Assert(type == CHANNEL_NRM || type == SPATIAL_NRM);
useOpenCL = cv::ocl::useOpenCL();
shape[0] = 1; //maybe make shape[0] = 1 too
bufBlob.create(shape);
}
if (type == SPATIAL_NRM && !useOpenCL)
buf.create(inputs[0]->shape().slice(2), inputs[0]->type(), Blob::ALLOC_MAT);
if (type == CHANNEL_NRM && useOpenCL)
buf.create(inputs[0]->shape().slice(2), inputs[0]->type(), Blob::ALLOC_UMAT);
void LRNLayer::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
outputs.resize(1);
outputs[0].create(inputs[0]->shape(), inputs[0]->type());
}
void LRNLayerImpl::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
{
Blob &src = *inputs[0];
Blob &dst = outputs[0];
switch (type)
{
Blob &src = *inputs[0];
Blob &dst = outputs[0];
case CHANNEL_NRM:
channelNoramlization(src, dst);
break;
case SPATIAL_NRM:
spatialNormalization(src, dst);
break;
default:
CV_Error(Error::StsNotImplemented, "Unimplemented mode of LRN layer");
break;
}
}
switch (type)
{
case CHANNEL_NRM:
channelNoramlization(src, dst);
break;
case SPATIAL_NRM:
spatialNormalization(src, dst);
break;
default:
CV_Error(cv::Error::StsNotImplemented, "Unimplemented mode of LRN layer");
break;
}
template<typename XMat>
static XMat getPlane(XMat &m, int n, int cn)
{
return reshaped(slice(m, n, cn), BlobShape::like(m).slice(2));
}
void LRNLayerImpl::channelNoramlization(Blob &src, Blob &dst)
{
if (!useOpenCL)
channelNoramlization_<Mat>(src, dst);
else
{
//channelNoramlization_ocl(src.getRefConst<UMat>(), dst.getRef<UMat>()); //consumes a lot of memory
channelNoramlization_<UMat>(src, dst);
}
}
void LRNLayer::channelNoramlization(Blob &srcBlob, Blob &dstBlob)
template<typename XMat>
void LRNLayerImpl::channelNoramlization_(Blob &srcBlob, Blob &dstBlob)
{
int num = srcBlob.num();
int channels = srcBlob.channels();
int ksize = (size - 1) / 2;
XMat srcMat = srcBlob.getRefConst<XMat>();
XMat dstMat = dstBlob.getRef<XMat>();
for (int n = 0; n < num; n++)
{
CV_DbgAssert(srcBlob.ptr() != dstBlob.ptr());
XMat accum = getPlane(dstMat, n, channels-1); //trick for memory saving
accum.setTo(0);
int num = srcBlob.num();
int channels = srcBlob.channels();
int ksize = (size - 1) / 2;
for (int cn = 0; cn < std::min(ksize, channels); cn++)
cv::accumulateSquare(getPlane(srcMat, n, cn), accum);
for (int n = 0; n < num; n++)
for (int cn = 0; cn < channels; cn++)
{
Mat accum = dstBlob.getPlane(n, channels-1); //trick for memory saving
accum.setTo(0);
for (int cn = 0; cn < std::min(ksize, channels); cn++)
cv::accumulateSquare(srcBlob.getPlane(n, cn), accum);
if (cn + ksize < channels)
{
cv::accumulateSquare(getPlane(srcMat, n, cn + ksize), accum);
}
for (int cn = 0; cn < channels; cn++)
if (cn - ksize - 1 >= 0)
{
if (cn + ksize < channels)
{
cv::accumulateSquare(srcBlob.getPlane(n, cn + ksize), accum);
}
if (cn - ksize - 1 >= 0)
{
Mat left = srcBlob.getPlane(n, cn - ksize - 1);
cv::subtract(accum, left.mul(left), accum); //subtractSquare
}
Mat dst = dstBlob.getPlane(n, cn);
accum.convertTo(dst, dst.type(), alpha/size, 1);
cv::pow(dst, beta, dst);
cv::divide(srcBlob.getPlane(n, cn), dst, dst);
//subtractSquare
XMat left = getPlane(srcMat, n, cn - ksize - 1);
cv::pow(left, 2, left);
cv::subtract(accum, left, accum);
}
XMat dst = getPlane(dstMat, n, cn);
accum.convertTo(dst, dst.type(), alpha/size, 1);
cv::pow(dst, beta, dst);
cv::divide(getPlane(srcMat, n, cn), dst, dst);
}
}
}
void LRNLayer::spatialNormalization(Blob &srcBlob, Blob &dstBlob)
{
int num = srcBlob.num();
int channels = srcBlob.channels();
bool LRNLayerImpl::channelNoramlization_ocl(const UMat &src, UMat &dst)
{
#ifdef HAVE_OPENCL
if (src.offset != 0 || dst.offset != 0) //TODO: add offset
return false;
String buildOpts = String("-DT=") + ocl::typeToStr(src.type());
ocl::Kernel kerScale("LRNFillScale", ocl::dnn::lrn_oclsrc, buildOpts);
if (kerScale.empty())
return false;
ocl::Kernel kerOutput("LRNComputeOutput", ocl::dnn::lrn_oclsrc, buildOpts);
if (kerOutput.empty())
return false;
Shape shape = Shape::like(src);
int ksize = (size - 1) / 2;
size_t wgSize = ocl::Device::getDefault().maxWorkGroupSize();
UMat &scaleBuf = buf.umatRef();
size_t nthreads = (size_t)(shape.total() / shape[1]);
kerScale.args((int)nthreads,
ocl::KernelArg::PtrReadOnly(src), shape[0], shape[1], shape[2], shape[3],
size, (float)(alpha/size), (float)ksize, ocl::KernelArg::PtrWriteOnly(scaleBuf));
if (!kerScale.run(1, &nthreads, &wgSize, true))
return false;
nthreads = (size_t)shape.total();
kerOutput.args((int)nthreads,
ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrReadOnly(scaleBuf),
-beta, ocl::KernelArg::PtrWriteOnly(dst) );
if (!kerOutput.run(1, &nthreads, &wgSize, true))
return false;
return true;
#else
(void)src;
(void)dst;
return false;
#endif
}
void LRNLayerImpl::spatialNormalization(Blob &src, Blob &dst)
{
if (!useOpenCL)
spatialNormalization_<Mat>(src, dst);
else
spatialNormalization_<UMat>(src, dst);
}
//TODO: fix cv::boxFilter with BORDER_ISOLATED flag in CPU mode
template<>
void LRNLayerImpl::sqrBoxFilter_<Mat>(const Mat &src, Mat &dst)
{
Mat bufMat = buf.getRef<Mat>();
src.copyTo(bufMat);
cv::sqrBoxFilter(bufMat, dst, dst.depth(), Size(size, size), Point(-1, -1), false, BORDER_CONSTANT);
}
template<>
void LRNLayerImpl::sqrBoxFilter_<UMat>(const UMat &src, UMat &dst)
{
cv::sqrBoxFilter(src, dst, dst.depth(), Size(size, size), Point(-1, -1), false, BORDER_CONSTANT | BORDER_ISOLATED);
}
for (int n = 0; n < num; n++)
template<typename XMat>
void LRNLayerImpl::spatialNormalization_(Blob &srcBlob, Blob &dstBlob)
{
int num = srcBlob.num();
int channels = srcBlob.channels();
XMat srcMat = srcBlob.getRefConst<XMat>();
XMat dstMat = dstBlob.getRef<XMat>();
for (int n = 0; n < num; n++)
{
for (int cn = 0; cn < channels; cn++)
{
for (int cn = 0; cn < channels; cn++)
{
Mat src = srcBlob.getPlane(n, cn);
Mat dst = dstBlob.getPlane(n, cn);
uchar *dataDst0 = dst.data;
cv::pow(srcBlob.getPlane(n, cn), 2, dst);
//TODO: check border type
cv::boxFilter(dst, dst, dst.depth(), cv::Size(size, size), cv::Point(-1, -1), false, cv::BORDER_CONSTANT);
dst.convertTo(dst, dst.type(), alpha/(size*size), 1);
cv::pow(dst, beta, dst);
cv::divide(src, dst, dst);
CV_Assert(dataDst0 == dst.data); //debug
}
XMat src = getPlane(srcMat, n, cn);
XMat dst = getPlane(dstMat, n, cn);
sqrBoxFilter_(src, dst);
dst.convertTo(dst, dst.type(), alpha/(size*size), 1);
cv::pow(dst, beta, dst);
cv::divide(src, dst, dst);
}
}
}
Ptr<LRNLayer> LRNLayer::create(int type, int size, double alpha, double beta)
{
return Ptr<LRNLayer>(new LRNLayerImpl(type, size, alpha, beta));
}
}
}
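Reviewer note: the running sum of squares above implements Caffe's ACROSS_CHANNELS LRN. A scalar reference version for one spatial position (illustrative only; assumes <cmath>, <vector>, <algorithm>):

    // dst_c = src_c / (1 + alpha/size * sum over window(c) of src^2)^beta
    float lrnAcrossChannelsRef(const std::vector<float> &src, int c, int size,
                               float alpha, float beta)
    {
        int ksize = (size - 1) / 2;
        float accum = 0.f;
        for (int i = std::max(0, c - ksize); i <= std::min((int)src.size() - 1, c + ksize); i++)
            accum += src[i] * src[i];
        return src[c] / std::pow(1.f + alpha / size * accum, beta);
    }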

@ -42,34 +42,36 @@
#ifndef __OPENCV_DNN_LAYERS_LRN_LAYER_HPP__
#define __OPENCV_DNN_LAYERS_LRN_LAYER_HPP__
#include "../precomp.hpp"
#include <opencv2/dnn/all_layers.hpp>

namespace cv
{
namespace dnn
{

class LRNLayerImpl : public LRNLayer
{
    bool useOpenCL;
    Blob buf;

    void channelNoramlization(Blob &src, Blob &dst);
    template<typename XMat>
    void channelNoramlization_(Blob &src, Blob &dst);
    bool channelNoramlization_ocl(const UMat &src, UMat &dst);

    void spatialNormalization(Blob &src, Blob &dst);
    template<typename XMat>
    void spatialNormalization_(Blob &src, Blob &dst);
    template<typename XMat>
    void sqrBoxFilter_(const XMat &src, XMat &dst);

public:
    LRNLayerImpl(int type = CHANNEL_NRM, int size = 5, double alpha = 1, double beta = 0.75);
    void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
    void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
};

}
}
#endif

@ -42,20 +42,21 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "mvn_layer.hpp"
#include <opencv2/dnn/shape_utils.hpp>
namespace cv
{
namespace dnn
{
MVNLayer::MVNLayer(LayerParams &params) : Layer(params)
MVNLayerImpl::MVNLayerImpl(bool normVariance_, bool acrossChannels_, double eps_)
{
eps = params.get<double>("eps", 1e-9);
acrossChannels = params.get<bool>("across_channels", false);
normalizeVariance = params.get<bool>("normalize_variance", true);
normVariance = normVariance_;
acrossChannels = acrossChannels_;
eps = eps_;
}
void MVNLayer::allocate(const std::vector<Blob *> &inputs, std::vector<Blob> &outputs)
void MVNLayerImpl::allocate(const std::vector<Blob *> &inputs, std::vector<Blob> &outputs)
{
outputs.resize(inputs.size());
for (size_t i = 0; i < inputs.size(); i++)
@ -65,20 +66,17 @@ void MVNLayer::allocate(const std::vector<Blob *> &inputs, std::vector<Blob> &ou
}
}
void MVNLayer::forward(std::vector<Blob *> &inputs, std::vector<Blob> &outputs)
void MVNLayerImpl::forward(std::vector<Blob *> &inputs, std::vector<Blob> &outputs)
{
for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
{
Blob &inpBlob = *inputs[inpIdx];
Blob &outBlob = outputs[inpIdx];
int workSize[2];
int splitDim = (acrossChannels) ? 1 : 2;
workSize[0] = (int)inpBlob.total(0, splitDim);
workSize[1] = (int)inpBlob.total(splitDim);
Mat inpMat = inpBlob.matRef().reshape(1, 2, workSize);
Mat outMat = outBlob.matRef().reshape(1, 2, workSize);
Shape workSize((int)inpBlob.total(0, splitDim), (int)inpBlob.total(splitDim));
Mat inpMat = reshaped(inpBlob.matRefConst(), workSize);
Mat outMat = reshaped(outBlob.matRef(), workSize);
Scalar mean, dev;
for (int i = 0; i < workSize[0]; i++)
@ -86,12 +84,18 @@ void MVNLayer::forward(std::vector<Blob *> &inputs, std::vector<Blob> &outputs)
Mat inpRow = inpMat.row(i);
Mat outRow = outMat.row(i);
cv::meanStdDev(inpRow, mean, (normalizeVariance) ? dev : noArray());
double alpha = (normalizeVariance) ? 1/(eps + dev[0]) : 1;
cv::meanStdDev(inpRow, mean, (normVariance) ? dev : noArray());
double alpha = (normVariance) ? 1/(eps + dev[0]) : 1;
inpRow.convertTo(outRow, outRow.type(), alpha, -mean[0] * alpha);
}
}
}
Ptr<MVNLayer> MVNLayer::create(bool normVariance, bool acrossChannels, double eps)
{
return Ptr<MVNLayer>(new MVNLayerImpl(normVariance, acrossChannels, eps));
}
}
}
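For reference, each row of the reshaped matrix is normalized independently (values illustrative):

    // out = (in - mean) / (eps + stddev)  when normalize_variance is set,
    // out = in - mean                     otherwise.
    // Example: the row {1, 2, 3} has mean 2 and stddev ~0.816, so with a tiny
    // eps it maps to approximately {-1.22, 0, 1.22}.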

@ -42,20 +42,18 @@
#ifndef __OPENCV_DNN_LAYERS_MVN_LAYER_HPP__
#define __OPENCV_DNN_LAYERS_MVN_LAYER_HPP__
#include "../precomp.hpp"
#include <opencv2/dnn/all_layers.hpp>

namespace cv
{
namespace dnn
{

class MVNLayerImpl : public MVNLayer
{
public:
    MVNLayerImpl(bool normVariance_ = true, bool acrossChannels_ = false, double eps_ = 1e-9);
    void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
    void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
};

@ -0,0 +1,95 @@
#include "op_blas.hpp"
#if HAVE_CBLAS
#include "opencv_cblas.hpp"
#endif
#include <iostream>
namespace cv
{
namespace dnn
{
void gemm(InputArray A, InputArray B, double alpha, InputOutputArray C, double beta, int flags)
{
if (C.isMat())
gemmCPU(A.getMat(), B.getMat(), alpha, C.getMatRef(), beta, flags);
else
{
cv::gemm(A, B, alpha, (beta == 0) ? noArray() : C, beta, C, flags);
}
}
inline void SwapRowCols(const Mat &A, int &rows, int &cols, bool isTrans)
{
CV_DbgAssert(A.dims == 2);
rows = (isTrans) ? A.cols : A.rows;
cols = (isTrans) ? A.rows : A.cols;
}
void gemmCPU(const Mat &A, const Mat &B, double alpha, Mat &C, double beta, int flags /*= 0*/)
{
#if HAVE_CBLAS
bool transA = static_cast<bool>(flags & GEMM_1_T);
bool transB = static_cast<bool>(flags & GEMM_2_T);
bool transC = static_cast<bool>(flags & GEMM_3_T);
int Arows, Acols, Brows, Bcols, Crows, Ccols;
SwapRowCols(A, Arows, Acols, transA);
SwapRowCols(B, Brows, Bcols, transB);
SwapRowCols(C, Crows, Ccols, transC);
CV_Assert(!(flags & GEMM_3_T));
CV_Assert(Acols == Brows && Arows == Crows && Bcols == Ccols);
CV_Assert(A.isContinuous() && B.isContinuous() && C.isContinuous());
CV_Assert(A.type() == B.type() && B.type() == C.type());
CV_Assert(A.data != C.data && B.data != C.data);
if (C.type() == CV_32F)
{
cblas_sgemm(CblasRowMajor, transA ? CblasTrans : CblasNoTrans, transB ? CblasTrans : CblasNoTrans,
Arows, Bcols, Acols,
(float)alpha, A.ptr<float>(), A.cols,
B.ptr<float>(), B.cols,
(float)beta, C.ptr<float>(), C.cols);
}
else if (C.type() == CV_64F)
{
//TODO: Should be tested
cblas_dgemm(CblasRowMajor, transA ? CblasTrans : CblasNoTrans, transB ? CblasTrans : CblasNoTrans,
Arows, Bcols, Acols,
alpha, A.ptr<double>(), A.cols,
B.ptr<double>(), B.cols,
beta, C.ptr<double>(), C.cols);
}
else
{
CV_Error(Error::BadDepth, "Only floating point types are supported");
}
#else
cv::gemm(A, B, alpha, C, beta, C, flags);
#endif
}
int getBlasThreads()
{
#ifdef OPENBLAS_VERSION
return openblas_get_num_threads();
#else
return 1;
#endif
}
void setBlasThreads(int numThreads)
{
#ifdef OPENBLAS_VERSION
openblas_set_num_threads(numThreads);
goto_set_num_threads(numThreads);
#else
(void)numThreads; //suppress compilers' warning
#endif
}
}
}
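A small usage sketch of the wrapper above (illustrative shapes; per the asserts, gemmCPU needs continuous, same-type matrices with C distinct from A and B):

    Mat A(4, 3, CV_32F), B(3, 5, CV_32F), C(4, 5, CV_32F);
    dnn::gemmCPU(A, B, 1.0, C, 0.0);           // C = A*B, via cblas_sgemm when available
    dnn::gemmCPU(C, B, 1.0, A, 0.0, GEMM_2_T); // A = C*B^T -> 4x3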

@ -39,47 +39,21 @@
//
//M*/
#ifndef __OPENCV_DNN_LAYERS_OP_BLAS_HPP__
#define __OPENCV_DNN_LAYERS_OP_BLAS_HPP__
#include "../precomp.hpp"

namespace cv
{
namespace dnn
{

    int getBlasThreads();
    void setBlasThreads(int numThreads);

    void gemm(InputArray A, InputArray B, double alpha, InputOutputArray C, double beta, int flags = 0);
    void gemmCPU(const Mat &A, const Mat &B, double alpha, Mat &C, double beta, int flags = 0);

}
}
#endif

@ -39,88 +39,84 @@
//
//M*/
#include "../precomp.hpp"
#include <opencv2/core/ocl.hpp>
#include "opencl_kernels_dnn.hpp"
#include "op_im2col.hpp"

namespace cv
{
namespace dnn
{

#ifdef HAVE_OPENCL

bool im2col_ocl(const UMat &img,
                int channels, int height, int width,
                int kernel_h, int kernel_w,
                int pad_h, int pad_w,
                int stride_h, int stride_w,
                UMat &col)
{
    int esz = img.elemSize();
    int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
    int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
    int channels_col = channels * kernel_h * kernel_w;

    CV_Assert(img.isContinuous() && col.isContinuous());
    CV_Assert(img.total() == (size_t)channels * height * width);
    CV_Assert(col.total() == (size_t)channels_col * height_col * width_col);

    ocl::Kernel ker("im2col", ocl::dnn::im2col_oclsrc, String("-DT=") + ocl::typeToStr(img.type()));
    if (ker.empty())
        return false;

    ker.args(ocl::KernelArg::PtrReadOnly(img), (int)img.offset/esz,
             channels, height, width,
             kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
             height_col, width_col,
             ocl::KernelArg::PtrWriteOnly(col), (int)col.offset/esz
             );

    size_t localSize = ocl::Device::getDefault().maxWorkGroupSize();
    size_t globalSize = (size_t)channels * height_col * width_col;
    return ker.run(1, &globalSize, &localSize, true);
}

bool col2im_ocl(const UMat &col,
                int channels, int height, int width,
                int kernel_h, int kernel_w,
                int pad_h, int pad_w,
                int stride_h, int stride_w,
                UMat &img)
{
    int esz = img.elemSize();
    int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
    int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
    int channels_col = channels * kernel_h * kernel_w;

    CV_Assert(img.isContinuous() && col.isContinuous());
    CV_Assert(img.total() == (size_t)channels * height * width);
    CV_Assert(col.total() == (size_t)channels_col * height_col * width_col);

    ocl::Kernel ker("col2im", ocl::dnn::col2im_oclsrc, String("-DT=") + ocl::typeToStr(col.type()));
    if (ker.empty())
        return false;

    ker.args((int)img.total(),
             ocl::KernelArg::PtrReadOnly(col), (int)col.offset/esz,
             height, width, channels,
             kernel_h, kernel_w,
             pad_h, pad_w,
             stride_h, stride_w,
             height_col, width_col,
             ocl::KernelArg::PtrWriteOnly(img), (int)img.offset/esz);

    size_t localSize = ocl::Device::getDefault().maxWorkGroupSize();
    size_t globalSize = img.total();
    return ker.run(1, &globalSize, &localSize, true);
}

#endif

}
}

@ -0,0 +1,231 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_DNN_LAYERS_IM2COL_HPP__
#define __OPENCV_DNN_LAYERS_IM2COL_HPP__
#include "../precomp.hpp"
#include <iostream>
namespace cv
{
namespace dnn
{
template <typename Dtype>
class im2col_CpuPBody : public cv::ParallelLoopBody
{
const Dtype* data_im;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
Dtype* data_col;
int height_col, width_col, channels_col;
im2col_CpuPBody() {}
public:
static void run(const Dtype* data_im,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
Dtype* data_col)
{
im2col_CpuPBody<Dtype> t;
t.data_im = data_im;
t.data_col = data_col;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
t.width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
t.channels_col = channels * kernel_h * kernel_w;
cv::parallel_for_(Range(0, t.channels_col), t);
}
virtual void operator ()(const Range &r) const
{
for (int c = r.start; c < r.end; ++c) {
int w_offset = c % kernel_w;
int h_offset = (c / kernel_w) % kernel_h;
int c_im = c / kernel_h / kernel_w;
for (int h = 0; h < height_col; ++h) {
for (int w = 0; w < width_col; ++w) {
int h_pad = h * stride_h - pad_h + h_offset;
int w_pad = w * stride_w - pad_w + w_offset;
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_col[(c * height_col + h) * width_col + w] =
data_im[(c_im * height + h_pad) * width + w_pad];
else
data_col[(c * height_col + h) * width_col + w] = 0;
}
}
}
}
};
template <typename Dtype>
class col2im_CpuPBody : public cv::ParallelLoopBody
{
const Dtype* data_col;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
Dtype* data_im;
int height_col, width_col;
col2im_CpuPBody() {}
public:
static void run(const Dtype* data_col,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
Dtype* data_im)
{
//TODO: single-threaded version switch
col2im_CpuPBody t;
t.data_col = data_col;
t.data_im = data_im;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
t.width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
int img_total = channels * height * width;
cv::parallel_for_(Range(0, img_total), t);
}
virtual void operator ()(const Range &r) const
{
for (int index = r.start; index < r.end; index++)
{
Dtype val = 0;
int w = index % width + pad_w;
int h = (index / width) % height + pad_h;
int c = index / (width * height);
// compute the start and end of the output
int w_col_start = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
int w_col_end = std::min(w / stride_w + 1, width_col);
int h_col_start = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
int h_col_end = std::min(h / stride_h + 1, height_col);
// equivalent implementation
int offset =
(c * kernel_h * kernel_w + h * kernel_w + w) * height_col * width_col;
int coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
int coeff_w_col = (1 - stride_w * height_col * width_col);
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
}
}
data_im[index] = val;
}
}
};
//single-threaded version
template <typename Dtype>
void col2im_cpu(const Dtype* data_col,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
Dtype* data_im)
{
int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
int channels_col = channels * kernel_h * kernel_w;
std::memset(data_im, 0, height * width * channels * sizeof(Dtype));
for (int c = 0; c < channels_col; ++c)
{
int w_offset = c % kernel_w;
int h_offset = (c / kernel_w) % kernel_h;
int c_im = c / kernel_h / kernel_w;
for (int h = 0; h < height_col; ++h)
{
for (int w = 0; w < width_col; ++w)
{
int h_pad = h * stride_h - pad_h + h_offset;
int w_pad = w * stride_w - pad_w + w_offset;
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_im[(c_im * height + h_pad) * width + w_pad] +=
data_col[(c * height_col + h) * width_col + w];
}
}
}
}
#ifdef HAVE_OPENCL
bool im2col_ocl(const UMat &img,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
UMat &col);
bool col2im_ocl(const UMat &col,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
UMat &img);
#endif
}
}
#endif
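Worked shape example for im2col (illustrative numbers): a 3x5x5 image with a 3x3 kernel, pad 0, stride 1 gives height_col = width_col = (5 - 3)/1 + 1 = 3 and channels_col = 3*3*3 = 27, i.e. a 27x9 column matrix; convolution then reduces to a single GEMM of the (numOutput x 27) weights against it:

    std::vector<float> im(3 * 5 * 5, 1.f), col(27 * 3 * 3);
    im2col_CpuPBody<float>::run(im.data(), 3, 5, 5, 3, 3, 0, 0, 1, 1, col.data());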

@ -42,8 +42,10 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "pooling_layer.hpp"
#include "opencl_kernels_dnn.hpp"
#include <float.h>
#include <algorithm>
#include <opencv2/core/ocl.hpp>
using std::max;
using std::min;
@ -53,155 +55,216 @@ namespace dnn
{
//TODO: add ceil_mode param
PoolingLayer::PoolingLayer(LayerParams &params) : Layer(params)
{
if (params.has("pool"))
{
String pool = params.get<String>("pool").toLowerCase();
if (pool == "max")
type = MAX;
else if (pool == "ave")
type = AVE;
else if (pool == "stochastic")
type = STOCHASTIC;
else
CV_Error(cv::Error::StsBadArg, "Unknown pooling type \"" + pool + "\"");
}
else
{
type = MAX;
}
PoolingLayerImpl::PoolingLayerImpl()
{
getKernelParams(params, kernelH, kernelW, padH, padW, strideH, strideW);
}
}
void PoolingLayer::allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
{
CV_Assert(inputs.size() > 0);
PoolingLayerImpl::PoolingLayerImpl(int type_, Size kernel_, Size stride_, Size pad_)
{
type = type_;
kernel = kernel_;
pad = pad_;
stride = stride_;
}
inpW = inputs[0]->cols();
inpH = inputs[0]->rows();
computeOutputShape(inpH, inpW);
void PoolingLayerImpl::allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
{
CV_Assert(inputs.size() > 0);
outputs.resize(inputs.size());
for (size_t i = 0; i < inputs.size(); i++)
{
CV_Assert(inputs[i]->rows() == inpH && inputs[i]->cols() == inpW);
outputs[i].create(BlobShape(inputs[i]->num(), inputs[i]->channels(), outH, outW));
}
inp = inputs[0]->size2();
computeOutputShape(inp);
useOpenCL = ocl::useOpenCL();
outputs.resize(inputs.size());
for (size_t i = 0; i < inputs.size(); i++)
{
CV_Assert(inputs[i]->rows() == inp.height && inputs[i]->cols() == inp.width);
outputs[i].create(BlobShape(inputs[i]->num(), inputs[i]->channels(), out.height, out.width));
}
}
void PoolingLayer::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
void PoolingLayerImpl::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
{
for (size_t ii = 0; ii < inputs.size(); ii++)
{
for (size_t ii = 0; ii < inputs.size(); ii++)
switch (type)
{
switch (type)
{
case MAX:
maxPooling(*inputs[ii], outputs[ii]);
break;
case AVE:
avePooling(*inputs[ii], outputs[ii]);
break;
default:
CV_Error(cv::Error::StsNotImplemented, "Not implemented");
break;
}
case MAX:
maxPooling(*inputs[ii], outputs[ii]);
break;
case AVE:
avePooling(*inputs[ii], outputs[ii]);
break;
default:
CV_Error(Error::StsNotImplemented, "Not implemented");
break;
}
}
}
void PoolingLayerImpl::maxPooling(Blob &src, Blob &dst)
{
if (!useOpenCL)
maxPooling_cpu(src, dst);
else
{
CV_Assert(maxPooling_ocl(src, dst));
}
}
void PoolingLayer::maxPooling(Blob &input, Blob &output)
bool PoolingLayerImpl::maxPooling_ocl(Blob &src, Blob &dst)
{
return pooling_ocl("MaxPoolForward", src, dst);
}
void PoolingLayerImpl::avePooling(Blob &src, Blob &dst)
{
if (!useOpenCL)
avePooling_cpu(src, dst);
else
{
CV_DbgAssert(output.rows() == outH && output.cols() == outW);
CV_Assert(avePooling_ocl(src, dst));
}
}
bool PoolingLayerImpl::avePooling_ocl(Blob &src, Blob &dst)
{
return pooling_ocl("AvePoolForward", src, dst);
}
for (int n = 0; n < input.num(); ++n)
void PoolingLayerImpl::maxPooling_cpu(Blob &src, Blob &dst)
{
CV_DbgAssert(dst.rows() == out.height && dst.cols() == out.width);
for (int n = 0; n < src.num(); ++n)
{
for (int c = 0; c < src.channels(); ++c)
{
for (int c = 0; c < input.channels(); ++c)
{
float *srcData = input.ptrf(n, c);
float *dstData = output.ptrf(n, c);
const float *srcData = src.ptrf(n, c);
float *dstData = dst.ptrf(n, c);
for (int ph = 0; ph < outH; ++ph)
for (int ph = 0; ph < out.height; ++ph)
{
for (int pw = 0; pw < out.width; ++pw)
{
for (int pw = 0; pw < outW; ++pw)
{
int hstart = ph * strideH - padH;
int wstart = pw * strideW - padW;
int hend = min(hstart + kernelH, inpH);
int wend = min(wstart + kernelW, inpW);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const int poolIndex = ph * outW + pw;
float max_val = -FLT_MAX;
for (int h = hstart; h < hend; ++h)
for (int w = wstart; w < wend; ++w)
{
const int index = h * inpW + w;
if (srcData[index] > max_val)
max_val = srcData[index];
}
dstData[poolIndex] = max_val;
}
int hstart = ph * stride.height - pad.height;
int wstart = pw * stride.width - pad.width;
int hend = min(hstart + kernel.height, inp.height);
int wend = min(wstart + kernel.width, inp.width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const int poolIndex = ph * out.width + pw;
float max_val = -FLT_MAX;
for (int h = hstart; h < hend; ++h)
for (int w = wstart; w < wend; ++w)
{
const int index = h * inp.width + w;
if (srcData[index] > max_val)
max_val = srcData[index];
}
dstData[poolIndex] = max_val;
}
}
}
}
}
#ifdef HAVE_OPENCL
bool PoolingLayerImpl::pooling_ocl(const char *kname, const Blob &src, Blob &dst, Blob *mask)
{
const UMat &srcMat = src.umatRefConst();
UMat &dstMat = dst.umatRef();
CV_Assert(mask == NULL && srcMat.offset == 0 && dstMat.offset == 0);
ocl::Kernel ker(kname, ocl::dnn::pooling_oclsrc, String("-DT=") + ocl::typeToStr(src.type()));
if (ker.empty())
return false;
BlobShape s = src.shape();
size_t nthreads = dst.total();
ker.args((int)nthreads,
ocl::KernelArg::PtrReadOnly(srcMat), s[0], s[1], s[2], s[3],
out.height, out.width, kernel.height, kernel.width,
stride.height, stride.width, pad.height, pad.width,
ocl::KernelArg::PtrWriteOnly(dstMat));
size_t wgSize = ocl::Device::getDefault().maxWorkGroupSize();
if (!ker.run(1, &nthreads, &wgSize, true))
return false;
return true;
}
#else
bool PoolingLayerImpl::pooling_ocl(const char*, const Blob&, Blob&, Blob*)
{
return false;
}
#endif
void PoolingLayer::avePooling(Blob &input, Blob &output)
void PoolingLayerImpl::avePooling_cpu(Blob &src, Blob &dst)
{
for (int n = 0; n < src.num(); ++n)
{
for (int n = 0; n < input.num(); ++n)
for (int c = 0; c < src.channels(); ++c)
{
for (int c = 0; c < input.channels(); ++c)
{
float *srcData = input.ptrf(n, c);
float *dstData = output.ptrf(n, c);
const float *srcData = src.ptrf(n, c);
float *dstData = dst.ptrf(n, c);
for (int ph = 0; ph < outH; ++ph)
for (int ph = 0; ph < out.height; ++ph)
{
for (int pw = 0; pw < out.width; ++pw)
{
for (int pw = 0; pw < outW; ++pw)
{
int hstart = ph * strideH - padH;
int wstart = pw * strideW - padW;
int hend = min(hstart + kernelH, inpH + padH);
int wend = min(wstart + kernelW, inpW + padW);
int poolSize = (hend - hstart) * (wend - wstart);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
hend = min(hend, inpH);
wend = min(wend, inpW);
dstData[ph * outW + pw] = 0.f;
for (int h = hstart; h < hend; ++h)
for (int w = wstart; w < wend; ++w)
dstData[ph * outW + pw] += srcData[h * inpW + w];
dstData[ph * outW + pw] /= poolSize;
}
int hstart = ph * stride.height - pad.height;
int wstart = pw * stride.width - pad.width;
int hend = min(hstart + kernel.height, inp.height + pad.height);
int wend = min(wstart + kernel.width, inp.width + pad.width);
int poolSize = (hend - hstart) * (wend - wstart);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
hend = min(hend, inp.height);
wend = min(wend, inp.width);
dstData[ph * out.width + pw] = 0.f;
for (int h = hstart; h < hend; ++h)
for (int w = wstart; w < wend; ++w)
dstData[ph * out.width + pw] += srcData[h * inp.width + w];
dstData[ph * out.width + pw] /= poolSize;
}
}
}
}
}
}
void PoolingLayer::computeOutputShape(int inH, int inW)
{
//Yeah, something strange Caffe scheme-)
outH = static_cast<int>(ceil(static_cast<float>(inH + 2 * padH - kernelH) / strideH)) + 1;
outW = static_cast<int>(ceil(static_cast<float>(inW + 2 * padW - kernelW) / strideW)) + 1;
void PoolingLayerImpl::computeOutputShape(Size inpSz)
{
//Yeah, something strange Caffe scheme-)
out.height = static_cast<int>(ceil(static_cast<float>(inpSz.height + 2 * pad.height - kernel.height) / stride.height)) + 1;
out.width = static_cast<int>(ceil(static_cast<float>(inpSz.width + 2 * pad.width - kernel.width) / stride.width)) + 1;
if (padH || padW)
{
// If we have padding, ensure that the last pooling starts strictly
// inside the image (instead of at the padding); otherwise clip the last.
if ((outH - 1) * strideH >= inH + padH)
--outH;
if ((outW - 1) * strideW >= inW + padW)
--outW;
CV_Assert((outH - 1) * strideH < inH + padH);
CV_Assert((outW - 1) * strideW < inW + padW);
}
if (pad.height || pad.width)
{
// If we have padding, ensure that the last pooling starts strictly
// inside the image (instead of at the padding); otherwise clip the last.
if ((out.height - 1) * stride.height >= inpSz.height + pad.height)
--out.height;
if ((out.width - 1) * stride.width >= inpSz.width + pad.width)
--out.width;
CV_Assert((out.height - 1) * stride.height < inpSz.height + pad.height);
CV_Assert((out.width - 1) * stride.width < inpSz.width + pad.width);
}
}
Ptr<PoolingLayer> PoolingLayer::create(int type, Size kernel, Size stride, Size pad)
{
return Ptr<PoolingLayer>(new PoolingLayerImpl(type, kernel, stride, pad));
}
}
}
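Reviewer note on the "strange Caffe scheme": a worked example of computeOutputShape with illustrative numbers:

    // inp = 5, kernel = 3, stride = 3, pad = 1:
    //   out = ceil((5 + 2*1 - 3) / 3.f) + 1 = ceil(1.33) + 1 = 3
    // clip check: (out - 1) * stride = 6 >= inp + pad = 6, so out drops to 2,
    // which guarantees the last window starts inside the image, not in the padding.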

@ -1,4 +1,4 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
@ -42,37 +42,39 @@
#ifndef __OPENCV_DNN_LAYERS_POOLING_LAYER_HPP__
#define __OPENCV_DNN_LAYERS_POOLING_LAYER_HPP__
#include "../precomp.hpp"
#include <opencv2/dnn/all_layers.hpp>

namespace cv
{
namespace dnn
{

class PoolingLayerImpl : public PoolingLayer
{
    bool useOpenCL;
    Size inp, out;

    void computeOutputShape(Size inpSz);

    bool pooling_ocl(const char *kname, const Blob &src, Blob &dst, Blob *mask = NULL);

    void maxPooling(Blob &src, Blob &dst);
    void maxPooling_cpu(Blob &src, Blob &dst);
    bool maxPooling_ocl(Blob &src, Blob &dst);

    void avePooling(Blob &src, Blob &dst);
    void avePooling_cpu(Blob &src, Blob &dst);
    bool avePooling_ocl(Blob &src, Blob &dst);

public:
    PoolingLayerImpl();
    PoolingLayerImpl(int type, Size kernel, Size stride, Size pad);

    void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
    void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
};

}
}
#endif

@ -0,0 +1,440 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "recurrent_layers.hpp"
#include "op_blas.hpp"
#include <iostream>
#include <cmath>
namespace cv
{
namespace dnn
{
template<typename Dtype>
static void tanh(const Mat &src, Mat &dst)
{
MatConstIterator_<Dtype> itSrc = src.begin<Dtype>();
MatIterator_<Dtype> itDst = dst.begin<Dtype>();
for (; itSrc != src.end<Dtype>(); itSrc++, itDst++)
*itDst = std::tanh(*itSrc);
}
static void tanh(const Mat &src, Mat &dst)
{
dst.create(src.dims, (const int*)src.size, src.type());
if (src.type() == CV_32F)
tanh<float>(src, dst);
else if (src.type() == CV_64F)
tanh<double>(src, dst);
else
CV_Error(Error::StsUnsupportedFormat, "Function supports only floating point types");
}
static void sigmoid(const Mat &src, Mat &dst)
{
cv::exp(-src, dst);
cv::pow(1 + dst, -1, dst);
}
class LSTMLayerImpl : public LSTMLayer
{
int numOut, numTimeStamps, numSamples, numInp;
Mat hInternal, cInternal;
Mat gates, dummyOnes;
int dtype;
bool allocated;
BlobShape outTailShape; //shape of single output sample
BlobShape outTsMatShape, outTsShape; //shape of N output samples
BlobShape outResShape; //shape of T timestamps and N output samples
bool useTimestampDim;
bool produceCellOutput;
public:
LSTMLayerImpl()
{
type = "LSTM";
useTimestampDim = true;
produceCellOutput = false;
allocated = false;
outTailShape = BlobShape::empty();
}
void setUseTimstampsDim(bool use)
{
CV_Assert(!allocated);
useTimestampDim = use;
}
void setProduceCellOutput(bool produce)
{
CV_Assert(!allocated);
produceCellOutput = produce;
}
void setC(const Blob &C)
{
CV_Assert(cInternal.empty() || C.total() == cInternal.total());
if (!cInternal.empty())
C.reshaped(BlobShape::like(cInternal)).matRefConst().copyTo(cInternal);
else
C.matRefConst().copyTo(cInternal);
}
void setH(const Blob &H)
{
CV_Assert(hInternal.empty() || H.total() == hInternal.total());
if (!hInternal.empty())
H.reshaped(BlobShape::like(hInternal)).matRefConst().copyTo(hInternal);
else
H.matRefConst().copyTo(hInternal);
}
Blob getC() const
{
CV_Assert(!cInternal.empty());
//TODO: add convenient Mat -> Blob constructor
Blob res(outTsShape, cInternal.type());
res.fill(res.shape(), res.type(), cInternal.data);
return res;
}
Blob getH() const
{
CV_Assert(!hInternal.empty());
Blob res(outTsShape, hInternal.type());
res.fill(res.shape(), res.type(), hInternal.data);
return res;
}
void setOutShape(const BlobShape &outTailShape_)
{
CV_Assert(!allocated || outTailShape_.total() == outTailShape.total());
outTailShape = outTailShape_;
}
void setWeights(const Blob &Wh, const Blob &Wx, const Blob &bias)
{
CV_Assert(Wh.dims() == 2 && Wx.dims() == 2);
CV_Assert(Wh.size(0) == Wx.size(0));
CV_Assert(Wh.size(0) == 4*Wh.size(1));
CV_Assert(Wh.size(0) == (int)bias.total());
CV_Assert(Wh.type() == Wx.type() && Wx.type() == bias.type());
blobs.resize(3);
blobs[0] = Wh;
blobs[1] = Wx;
blobs[2] = bias;
blobs[2].reshape(BlobShape(1, (int)bias.total()));
}
void allocate(const std::vector<Blob*> &input, std::vector<Blob> &output)
{
CV_Assert(blobs.size() == 3);
CV_Assert(input.size() == 1);
Blob &Wh = blobs[0], &Wx = blobs[1];
numOut = Wh.size(1);
numInp = Wx.size(1);
if (!outTailShape.isEmpty())
CV_Assert(outTailShape.total() == numOut);
else
outTailShape = BlobShape(numOut);
if (useTimestampDim)
{
CV_Assert(input[0]->dims() >= 2 && (int)input[0]->total(2) == numInp);
numTimeStamps = input[0]->size(0);
numSamples = input[0]->size(1);
outResShape = BlobShape(numTimeStamps, numSamples) + outTailShape;
}
else
{
CV_Assert(input[0]->dims() >= 1 && (int)input[0]->total(1) == numInp);
numTimeStamps = 1;
numSamples = input[0]->size(0);
outResShape = BlobShape(numSamples) + outTailShape;
}
outTsMatShape = BlobShape(numSamples, numOut);
outTsShape = BlobShape(numSamples) + outTailShape;
dtype = input[0]->type();
CV_Assert(dtype == CV_32F || dtype == CV_64F);
CV_Assert(Wh.type() == dtype);
output.resize( (produceCellOutput) ? 2 : 1 );
output[0].create(outResShape, dtype);
if (produceCellOutput)
output[1].create(outResShape, dtype);
if (hInternal.empty())
{
hInternal.create(outTsMatShape.dims(), outTsMatShape.ptr(), dtype);
hInternal.setTo(0);
}
else
{
CV_Assert((int)hInternal.total() == numSamples*numOut);
hInternal = hInternal.reshape(1, outTsMatShape.dims(), outTsMatShape.ptr());
}
if (cInternal.empty())
{
cInternal.create(outTsMatShape.dims(), outTsMatShape.ptr(), dtype);
cInternal.setTo(0);
}
else
{
CV_Assert((int)cInternal.total() == numSamples*numOut);
cInternal = cInternal.reshape(1, outTsMatShape.dims(), outTsMatShape.ptr());
}
gates.create(numSamples, 4*numOut, dtype);
dummyOnes.create(numSamples, 1, dtype);
dummyOnes.setTo(1);
allocated = true;
}
void forward(std::vector<Blob*> &input, std::vector<Blob> &output)
{
const Mat &Wh = blobs[0].matRefConst();
const Mat &Wx = blobs[1].matRefConst();
const Mat &bias = blobs[2].matRefConst();
int numSamplesTotal = numTimeStamps*numSamples;
Mat xTs = input[0]->reshaped(BlobShape(numSamplesTotal, numInp)).matRefConst();
BlobShape outMatShape(numSamplesTotal, numOut);
Mat hOutTs = output[0].reshaped(outMatShape).matRef();
Mat cOutTs = (produceCellOutput) ? output[1].reshaped(outMatShape).matRef() : Mat();
for (int ts = 0; ts < numTimeStamps; ts++)
{
Range curRowRange(ts*numSamples, (ts + 1)*numSamples);
Mat xCurr = xTs.rowRange(curRowRange);
gemmCPU(xCurr, Wx, 1, gates, 0, GEMM_2_T); // Wx * x_t
gemmCPU(hInternal, Wh, 1, gates, 1, GEMM_2_T); //+Wh * h_{t-1}
gemmCPU(dummyOnes, bias, 1, gates, 1); //+b
Mat gatesIFO = gates.colRange(0, 3*numOut);
Mat gateI = gates.colRange(0*numOut, 1*numOut);
Mat gateF = gates.colRange(1*numOut, 2*numOut);
Mat gateO = gates.colRange(2*numOut, 3*numOut);
Mat gateG = gates.colRange(3*numOut, 4*numOut);
sigmoid(gatesIFO, gatesIFO);
tanh(gateG, gateG);
//compute c_t
cv::multiply(gateF, cInternal, gateF); // f_t (*) c_{t-1}
cv::multiply(gateI, gateG, gateI); // i_t (*) g_t
cv::add(gateF, gateI, cInternal); // c_t = f_t (*) c_{t-1} + i_t (*) g_t
//compute h_t
tanh(cInternal, hInternal);
cv::multiply(gateO, hInternal, hInternal);
//save results in output blobs
hInternal.copyTo(hOutTs.rowRange(curRowRange));
if (produceCellOutput)
cInternal.copyTo(cOutTs.rowRange(curRowRange));
}
}
};
Ptr<LSTMLayer> LSTMLayer::create()
{
return Ptr<LSTMLayer>(new LSTMLayerImpl());
}
void LSTMLayer::forward(std::vector<Blob*>&, std::vector<Blob>&)
{
CV_Error(Error::StsInternal, "This function should be unreached");
}
int LSTMLayer::inputNameToIndex(String inputName)
{
if (inputName.toLowerCase() == "x")
return 0;
return -1;
}
int LSTMLayer::outputNameToIndex(String outputName)
{
if (outputName.toLowerCase() == "h")
return 0;
else if (outputName.toLowerCase() == "c")
return 1;
return -1;
}
class RNNLayerImpl : public RNNLayer
{
int numX, numH, numO;
int numSamples, numTimestamps, numSamplesTotal;
int dtype;
Mat Whh, Wxh, bh;
Mat Who, bo;
Mat hCurr, hPrev, dummyBiasOnes;
bool produceH;
public:
RNNLayerImpl()
{
type = "RNN";
produceH = false;
}
void setProduceHiddenOutput(bool produce = false)
{
produceH = produce;
}
void setWeights(const Blob &W_xh, const Blob &b_h, const Blob &W_hh, const Blob &W_ho, const Blob &b_o)
{
CV_Assert(W_hh.dims() == 2 && W_xh.dims() == 2);
CV_Assert(W_hh.size(0) == W_xh.size(0) && W_hh.size(0) == W_hh.size(1) && (int)b_h.total() == W_xh.size(0));
CV_Assert(W_ho.size(0) == (int)b_o.total());
CV_Assert(W_ho.size(1) == W_hh.size(1));
blobs.resize(5);
blobs[0] = W_xh;
blobs[1] = b_h;
blobs[2] = W_hh;
blobs[3] = W_ho;
blobs[4] = b_o;
}
void allocate(const std::vector<Blob*> &input, std::vector<Blob> &output)
{
CV_Assert(input.size() >= 1 && input.size() <= 2);
Wxh = blobs[0].matRefConst();
bh = blobs[1].matRefConst();
Whh = blobs[2].matRefConst();
Who = blobs[3].matRefConst();
bo = blobs[4].matRefConst();
numH = Wxh.rows;
numX = Wxh.cols;
numO = Who.rows;
CV_Assert(input[0]->dims() >= 2);
CV_Assert((int)input[0]->total(2) == numX);
CV_Assert(input[0]->type() == CV_32F || input[0]->type() == CV_64F);
dtype = input[0]->type();
numTimestamps = input[0]->size(0);
numSamples = input[0]->size(1);
numSamplesTotal = numTimestamps * numSamples;
hCurr.create(numSamples, numH, dtype);
hPrev.create(numSamples, numH, dtype);
hPrev.setTo(0);
dummyBiasOnes.create(numSamples, 1, dtype);
dummyBiasOnes.setTo(1);
bh = bh.reshape(1, 1); //is 1 x numH Mat
bo = bo.reshape(1, 1); //is 1 x numO Mat
reshapeOutput(output);
}
void reshapeOutput(std::vector<Blob> &output)
{
output.resize((produceH) ? 2 : 1);
output[0].create(BlobShape(numTimestamps, numSamples, numO), dtype);
if (produceH)
output[1].create(BlobShape(numTimestamps, numSamples, numH), dtype);
}
void forward(std::vector<Blob*> &input, std::vector<Blob> &output)
{
Mat xTs = input[0]->reshaped(BlobShape(numSamplesTotal, numX)).matRefConst();
Mat oTs = output[0].reshaped(BlobShape(numSamplesTotal, numO)).matRef();
Mat hTs = (produceH) ? output[1].reshaped(BlobShape(numSamplesTotal, numH)).matRef() : Mat();
for (int ts = 0; ts < numTimestamps; ts++)
{
Range curRowRange = Range(ts * numSamples, (ts + 1) * numSamples);
Mat xCurr = xTs.rowRange(curRowRange);
gemmCPU(hPrev, Whh, 1, hCurr, 0, GEMM_2_T); // W_{hh} * h_{prev}
gemmCPU(xCurr, Wxh, 1, hCurr, 1, GEMM_2_T); //+W_{xh} * x_{curr}
gemmCPU(dummyBiasOnes, bh, 1, hCurr, 1); //+bh
tanh(hCurr, hPrev);
Mat oCurr = oTs.rowRange(curRowRange);
gemmCPU(hPrev, Who, 1, oCurr, 0, GEMM_2_T); // W_{ho} * h_{prev}
gemmCPU(dummyBiasOnes, bo, 1, oCurr, 1); //+b_o
tanh(oCurr, oCurr);
if (produceH)
hPrev.copyTo(hTs.rowRange(curRowRange));
}
}
};
void RNNLayer::forward(std::vector<Blob*>&, std::vector<Blob>&)
{
CV_Error(Error::StsInternal, "This function should never be reached");
}
CV_EXPORTS_W Ptr<RNNLayer> RNNLayer::create()
{
return Ptr<RNNLayer>(new RNNLayerImpl());
}
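In equation form, the recurrence that RNNLayerImpl::forward() implements (matching its inline comments) is:

$$h_t = \tanh(W_{hh} h_{t-1} + W_{xh} x_t + b_h), \qquad o_t = \tanh(W_{ho} h_t + b_o)$$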
}
}

@ -0,0 +1,54 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_DNN_LAYERS_RECURRENT_LAYERS_HPP__
#define __OPENCV_DNN_LAYERS_RECURRENT_LAYERS_HPP__
#include "../precomp.hpp"
#include <opencv2/dnn/all_layers.hpp>
namespace cv
{
namespace dnn
{
}
}
#endif

@ -42,125 +42,46 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "reshape_layer.hpp"
#include <opencv2/dnn/shape_utils.hpp>
namespace cv
{
namespace dnn
{
ReshapeLayerImpl::ReshapeLayerImpl(const BlobShape &newShape_, Range applyingRange_)
{
newShapeDesc = newShape_;
newShapeRange = applyingRange_;
}
void ReshapeLayerImpl::allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
{
outputs.resize(inputs.size());
outShapes.resize(inputs.size());
for (size_t i = 0; i < inputs.size(); i++)
{
outShapes[i] = computeShapeByReshapeMask(inputs[i]->shape(), newShapeDesc, newShapeRange);
outputs[i].shareFrom(*inputs[i]);
outputs[i].reshape(outShapes[i]);
}
}
void ReshapeLayerImpl::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
{
for (size_t i = 0; i < outputs.size(); i++)
{
outputs[i].shareFrom(*inputs[i]);
outputs[i].reshape(outShapes[i]);
}
}
Ptr<ReshapeLayer> ReshapeLayer::create(const BlobShape &newShape, Range applyingRange /*= Range::all()*/)
{
return Ptr<ReshapeLayer>(new ReshapeLayerImpl(newShape, applyingRange));
}
}
}
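computeShapeByReshapeMask() keeps the Caffe-style mask semantics: a 0 entry in the new shape copies the corresponding input dimension, and a single -1 entry is inferred so the total element count is preserved. A small illustrative sketch (shapes invented for the example):

    // Flatten everything after the first axis: (2 x 3 x 4) -> (2 x 12).
    Ptr<ReshapeLayer> reshape = ReshapeLayer::create(BlobShape(0, -1));
    // dim 0 is copied from the input, dim 1 is inferred as 24 / 2 = 12.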

@ -42,26 +42,23 @@
#ifndef __OPENCV_DNN_LAYERS_RESHAPE_LAYER_HPP__
#define __OPENCV_DNN_LAYERS_RESHAPE_LAYER_HPP__
#include "../precomp.hpp"
#include <opencv2/dnn/all_layers.hpp>
namespace cv
{
namespace dnn
{
class ReshapeLayerImpl : public ReshapeLayer
{
std::vector<BlobShape> outShapes;
public:
ReshapeLayerImpl(const BlobShape &newShape_, Range applyingRange_);
void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
};

@ -42,55 +42,57 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "slice_layer.hpp"
#include <opencv2/core/ocl.hpp>
#include <opencv2/dnn/shape_utils.hpp>
namespace cv
{
namespace dnn
{
SliceLayerImpl::SliceLayerImpl(int axis_ /*= 1*/)
{
axis = axis_;
}
SliceLayerImpl::SliceLayerImpl(int axis_, const std::vector<int> &sliceIndices_)
{
axis = axis_;
sliceIndices = sliceIndices_;
}
void SliceLayerImpl::allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
{
CV_Assert(inputs.size() == 1);
const Blob &inpBlob = *inputs[0];
useOpenCL = ocl::useOpenCL() && inpBlob.getState() == Blob::HEAD_AT_UMAT;
int allocFlags = useOpenCL ? Blob::ALLOC_UMAT : Blob::ALLOC_MAT;
axisIdx = inpBlob.canonicalAxis(axis);
int axisSize = inpBlob.size(axisIdx);
BlobShape inpShape = inpBlob.shape();
if (sliceIndices.size()) //divide blob with respect to passed parameters
{
std::vector<int> outAxisSize;
int prevSlice = 0;
for (size_t i = 0; i < sliceIndices.size(); i++)
{
if (!(prevSlice < sliceIndices[i] && sliceIndices[i] < axisSize))
CV_Error(Error::StsBadArg, "Slice indices should be positive, strictly increasing and must not exceed the size of the sliced dimension");
outAxisSize.push_back(sliceIndices[i] - prevSlice);
prevSlice = sliceIndices[i];
}
outAxisSize.push_back(axisSize - prevSlice);
outputs.resize(outAxisSize.size());
for (size_t i = 0; i < outAxisSize.size(); i++)
{
inpShape[axisIdx] = outAxisSize[i];
outputs[i].create(inpShape, inpBlob.type(), allocFlags);
}
}
else //divide blob with respect to count of output blobs
@ -100,30 +102,45 @@ void SliceLayer::allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &o
for (size_t i = 0; i < outputs.size(); i++)
{
inpShape[axisIdx] = outAxisSize;
outputs[i].create(inpShape, inpBlob.type(), allocFlags);
}
}
}
void SliceLayerImpl::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
{
#ifdef HAVE_OPENCL
if (useOpenCL)
forward_<UMat>(inputs, outputs);
else
#endif
forward_<Mat>(inputs, outputs);
}
template<typename XMat>
void SliceLayerImpl::forward_(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
{
const XMat& inpMat = inputs[0]->getRefConst<XMat>();
std::vector<Range> ranges(inputs[0]->dims(), Range::all());
ranges[axisIdx].start = 0;
for (size_t i = 0; i < outputs.size(); i++)
{
ranges[axisIdx].end = ranges[axisIdx].start + outputs[i].size(axisIdx);
inpMat(&ranges[0]).copyTo(outputs[i].getRef<XMat>());
ranges[axisIdx].start = ranges[axisIdx].end;
}
}
Ptr<SliceLayer> SliceLayer::create(int axis)
{
return Ptr<SliceLayer>(new SliceLayerImpl(axis));
}
Ptr<SliceLayer> SliceLayer::create(int axis, const std::vector<int> &sliceIndices)
{
return Ptr<SliceLayer>(new SliceLayerImpl(axis, sliceIndices));
}
}
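A short sketch of the new create() overloads (shapes invented for the example):

    // Split a (1 x 6 x 2 x 2) blob along axis 1 at indices 2 and 4; after
    // allocate()/forward() this yields three outputs covering channel
    // ranges [0;2), [2;4) and [4;6).
    std::vector<int> sliceIndices;
    sliceIndices.push_back(2);
    sliceIndices.push_back(4);
    Ptr<SliceLayer> slice = SliceLayer::create(1, sliceIndices);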

@ -42,24 +42,28 @@
#ifndef __OPENCV_DNN_LAYERS_SLICE_LAYER_HPP__
#define __OPENCV_DNN_LAYERS_SLICE_LAYER_HPP__
#include "../precomp.hpp"
#include <opencv2/dnn/all_layers.hpp>
namespace cv
{
namespace dnn
{
class SliceLayerImpl : public SliceLayer
{
bool useOpenCL;
int axisIdx;
template<typename XMat>
void forward_(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
public:
SliceLayerImpl(int axis_ = 1);
SliceLayerImpl(int axis_, const std::vector<int> &sliceIndices_);
void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
};
}

@ -42,6 +42,8 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "softmax_layer.hpp"
#include <opencv2/core/ocl.hpp>
#include "modules/dnn/opencl_kernels_dnn.hpp"
#include <algorithm>
#include <stdlib.h>
using std::max;
@ -50,95 +52,173 @@ namespace cv
{
namespace dnn
{
SoftMaxLayerImpl::SoftMaxLayerImpl(int axis)
{
axisRaw = axis;
}
void SoftMaxLayerImpl::allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
{
CV_Assert(inputs.size() == 1);
axis = inputs[0]->canonicalAxis(axisRaw);
useOpenCL = ocl::useOpenCL();
BlobShape shape = inputs[0]->shape();
outerSize = shape.total(0, axis);
channels = shape[axis];
innerSize = shape.total(axis + 1);
int allocFlag = (useOpenCL) ? Blob::ALLOC_UMAT : Blob::ALLOC_MAT;
shape[axis] = 1;
buf.create(shape, inputs[0]->type(), allocFlag);
outputs.resize(1);
outputs[0].create(inputs[0]->shape(), inputs[0]->type(), allocFlag);
}
void SoftMaxLayerImpl::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
{
Blob &src = *inputs[0];
Blob &dst = outputs[0];
if (!useOpenCL)
forward_cpu(src, dst);
else
{
CV_Assert(forward_ocl(src, dst));
}
}
#ifdef HAVE_OPENCL
bool SoftMaxLayerImpl::forward_ocl(Blob &src, Blob &dst)
{
const UMat &srcMat = src.umatRefConst();
UMat &dstMat = dst.umatRef();
srcMat.copyTo(dstMat);
UMat &bufMat = buf.umatRef();
CV_Assert(dstMat.offset == 0);
String buildOpts = String("-DT=") + ocl::typeToStr(src.type());
ocl::Kernel kmax, ksub, ksum, kdiv;
if (!kmax.create("kernel_channel_max", ocl::dnn::softmax_oclsrc, buildOpts))
return false;
if (!ksub.create("kernel_channel_subtract", ocl::dnn::softmax_oclsrc, buildOpts))
return false;
if (!ksum.create("kernel_channel_sum", ocl::dnn::softmax_oclsrc, buildOpts))
return false;
if (!kdiv.create("kernel_channel_div", ocl::dnn::softmax_oclsrc, buildOpts))
return false;
size_t wgSize = ocl::Device::getDefault().maxWorkGroupSize();
size_t bufSize = buf.total();
size_t totalSize = src.total();
kmax.args((int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat));
if (!kmax.run(1, &bufSize, &wgSize, true))
return false;
ksub.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat));
if (!ksub.run(1, &totalSize, &wgSize, true))
return false;
cv::exp(dstMat, dstMat);
ksum.args((int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat));
if (!ksum.run(1, &bufSize, &wgSize, true))
return false;
kdiv.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat));
if (!kdiv.run(1, &totalSize, &wgSize, true))
return false;
return true;
}
#else
bool SoftMaxLayerImpl::forward_ocl(Blob&, Blob&)
{
return false;
}
#endif
void SoftMaxLayerImpl::forward_cpu(Blob &src, Blob &dst)
{
CV_Assert(src.type() == CV_32F);
float *srcPtr = src.ptrf();
float *dstPtr = dst.ptrf();
float *bufPtr = buf.ptrf();
size_t outerStep = src.total(axis);
size_t cnStep = src.total(axis + 1);
//compute max along axis
for (size_t outerDim = 0; outerDim < outerSize; outerDim++)
{
size_t srcOffset = outerDim * outerStep;
size_t bufOffset = outerDim * cnStep;
memcpy(bufPtr + bufOffset, srcPtr + srcOffset, innerSize * sizeof(float));
for (size_t cnDim = 1; cnDim < channels; cnDim++)
{
for (size_t i = 0; i < innerSize; i++)
bufPtr[bufOffset + i] = std::max(bufPtr[bufOffset + i], srcPtr[srcOffset + cnDim * cnStep + i]);
}
}
//subtract max
for (size_t outerDim = 0; outerDim < outerSize; outerDim++)
{
size_t srcOffset = outerDim * outerStep;
size_t bufOffset = outerDim * cnStep;
for (size_t cnDim = 0; cnDim < channels; cnDim++)
{
for (size_t i = 0; i < innerSize; i++)
dstPtr[srcOffset + cnDim * cnStep + i] = srcPtr[srcOffset + cnDim * cnStep + i] - bufPtr[bufOffset + i];
}
}
cv::exp(dst.matRef(), dst.matRef());
for (size_t outerDim = 0; outerDim < outerSize; outerDim++)
{
size_t srcOffset = outerDim * outerStep;
size_t bufOffset = outerDim * cnStep;
//sum exp along axis
for (size_t i = 0; i < innerSize; i++)
bufPtr[bufOffset + i] = 0.f;
for (size_t cnDim = 0; cnDim < channels; cnDim++)
{
for (size_t i = 0; i < innerSize; i++)
bufPtr[bufOffset + i] += dstPtr[srcOffset + cnDim * cnStep + i];
}
//divide by computed sum
for (size_t cnDim = 0; cnDim < channels; cnDim++)
{
for (size_t i = 0; i < innerSize; i++)
dstPtr[srcOffset + cnDim * cnStep + i] /= bufPtr[bufOffset + i];
}
}
}
Ptr<SoftmaxLayer> SoftmaxLayer::create(int axis)
{
return Ptr<SoftmaxLayer>(new SoftMaxLayerImpl(axis));
}
}
}
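Both the CPU and the OpenCL path compute the numerically stable softmax along the chosen axis, independently for every (outer, inner) position:

$$y_c = \frac{\exp(x_c - \max_k x_k)}{\sum_j \exp(x_j - \max_k x_k)}$$

Subtracting the per-slice maximum before exponentiation is the usual guard against overflow; the extra factor cancels in the ratio.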

@ -42,21 +42,31 @@
#ifndef __OPENCV_DNN_LAYERS_SOFTMAX_LAYER_HPP__
#define __OPENCV_DNN_LAYERS_SOFTMAX_LAYER_HPP__
#include "../precomp.hpp"
#include <opencv2/dnn/all_layers.hpp>
namespace cv
{
namespace dnn
{
class SoftMaxLayerImpl : public SoftmaxLayer
{
int axis, axisRaw;
Blob buf;
bool useOpenCL;
size_t outerSize, channels, innerSize;
bool forward_ocl(Blob &src, Blob &dst);
void forward_cpu(Blob &src, Blob &dst);
public:
SoftMaxLayerImpl(int axis = 1);
void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
};
}
}
#endif

@ -42,41 +42,46 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "split_layer.hpp"
#include <opencv2/core/ocl.hpp>
namespace cv
{
namespace dnn
{
SplitLayerImpl::SplitLayerImpl(int outputsCount_ /*= -1*/)
{
outputsCount = outputsCount_;
}
void SplitLayerImpl::allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
{
CV_Assert(inputs.size() == 1);
useOpenCL = ocl::useOpenCL() && inputs[0]->getState() == Blob::HEAD_AT_UMAT;
int allocFlags = useOpenCL ? Blob::ALLOC_UMAT : Blob::ALLOC_MAT;
if (outputsCount >= 0)
outputs.resize(outputsCount);
for (size_t i = 0; i < outputs.size(); i++)
outputs[i].create(inputs[0]->shape(), inputs[0]->type(), allocFlags);
}
void SplitLayerImpl::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
{
for (size_t i = 0; i < outputs.size(); i++)
{
if (useOpenCL)
inputs[0]->umatRefConst().copyTo(outputs[i].umatRef());
else
inputs[0]->matRefConst().copyTo(outputs[i].matRef());
}
}
Ptr<SplitLayer> SplitLayer::create(int outputsCount)
{
return Ptr<SplitLayer>(new SplitLayerImpl(outputsCount));
}
}

@ -42,23 +42,23 @@
#ifndef __OPENCV_DNN_LAYERS_SPLIT_LAYER_HPP__
#define __OPENCV_DNN_LAYERS_SPLIT_LAYER_HPP__
#include "../precomp.hpp"
#include <opencv2/dnn/all_layers.hpp>
namespace cv
{
namespace dnn
{
class SplitLayerImpl : public SplitLayer
{
bool useOpenCL;
public:
SplitLayerImpl(int outputsCount_ = -1);
void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
};
}

@ -0,0 +1,44 @@
__kernel void ReLUForward(const int count, __global const T* in, __global T* out
#ifndef RELU_NO_SLOPE
, T negative_slope
#endif
) {
int index = get_global_id(0);
if(index < count)
#ifndef RELU_NO_SLOPE
out[index] = in[index] > 0 ? in[index] : in[index] * negative_slope;
#else
out[index] = in[index] > 0 ? in[index] : 0;
#endif
}
__kernel void TanHForward(const int count, __global const T* in, __global T* out) {
int index = get_global_id(0);
if(index < count)
out[index] = tanh(in[index]);
}
__kernel void SigmoidForward(const int count, __global const T* in, __global T* out) {
int index = get_global_id(0);
if(index < count)
out[index] = 1. / (1. + exp(-in[index]));
}
__kernel void BNLLForward(const int n, __global const T* in, __global T* out) {
int index = get_global_id(0);
if (index < n) {
out[index] = in[index] > 0 ? in[index] + log(1. + exp(-in[index])) : log(1. + exp(in[index]));
}
}
__kernel void AbsValForward(const int n, __global const T* in, __global T* out) {
int index = get_global_id(0);
if (index < n)
out[index] = fabs(in[index]);
}
__kernel void PowForward(const int n, __global const T* in, __global T* out, const T power, const T scale, const T shift) {
int index = get_global_id(0);
if (index < n)
out[index] = pow(shift + scale * in[index], power);
}
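These kernels are type-generic: T is substituted at build time through a -DT=... option, the same pattern SoftMaxLayerImpl::forward_ocl() uses above. A hedged sketch of how the ReLU kernel might be instantiated for float data (the program source handle ocl::dnn::activations_oclsrc is an assumption derived from the file name):

    String buildOpts = String("-DT=") + ocl::typeToStr(CV_32F) + " -DRELU_NO_SLOPE";
    ocl::Kernel krelu;
    if (!krelu.create("ReLUForward", ocl::dnn::activations_oclsrc, buildOpts))
        { /* fall back to the CPU implementation */ }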

@ -0,0 +1,62 @@
/*************************************************************************************
* Copyright (c) 2015, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation and/or
* other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/
__kernel void col2im(const int n, __global const T* data_col, const int col_offset,
const int height, const int width, const int channels,
const int patch_h, const int patch_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int height_col, const int width_col,
__global T* data_im, const int img_offset)
{
data_col = data_col + col_offset;
data_im = data_im + img_offset;
int index = get_global_id(0);
if(index < n) {
T val = 0;
int w = index % width + pad_w;
int h = (index / width) % height + pad_h;
int c = index / (width * height);
// compute the start and end of the output
int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
int w_col_end = min(w / stride_w + 1, width_col);
int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
int h_col_end = min(h / stride_h + 1, height_col);
// equivalent implementation
int offset =
(c * patch_h * patch_w + h * patch_w + w) * height_col * width_col;
int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col;
int coeff_w_col = (1 - stride_w * height_col * width_col);
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
}
}
data_im[index] = val;
}
}

@ -39,11 +39,11 @@
//
//M*/
__kernel void im2col(__global const T *im_src, int im_src_offset,
int channels, int height_inp, int width_inp,
int kernel_h, int kernel_w, int pad_h, int pad_w, int stride_h, int stride_w,
int height_out, int width_out,
__global T *im_col, int im_col_offset
)
{
int index = get_global_id(0);
@ -52,13 +52,13 @@ __kernel void im2col(__global const float *im_src, int im_src_offset,
int j_out = index % width_out;
int i_out = (index / width_out) % height_out;
int c_inp = (index / width_out) / height_out;
int c_out = c_inp * kernel_h * kernel_w;
int i_inp = i_out * stride_h - pad_h;
int j_inp = j_out * stride_w - pad_w;
im_src += (c_inp * height_inp + i_inp) * width_inp + j_inp + im_src_offset;
im_col += (c_out * height_out + i_out) * width_out + j_out + im_col_offset;
for (int ki = 0; ki < kernel_h; ++ki)
for (int kj = 0; kj < kernel_w; ++kj) {

@ -0,0 +1,76 @@
/*************************************************************************************
* Copyright (c) 2015, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation and/or
* other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/
__kernel void LRNComputeOutput(const int nthreads, __global T* in, __global T* scale, const T negative_beta, __global T* out) {
int index = get_global_id(0);
int tmp = get_global_size(0);
for (; index < nthreads; index += tmp)
out[index] = in[index] * pow(scale[index], negative_beta);
}
__kernel void LRNFillScale(const int nthreads, __global T* in, const int num, const int channels, const int height, const int width, const int size, const T alpha_over_size, const T k, __global T* scale) {
int index = get_global_id(0);
int tmp = get_global_size(0);
for (; index < nthreads; index += tmp) {
// find out the local offset
const int w = index % width;
const int h = (index / width) % height;
const int n = index / width / height;
const int offset = (n * channels * height + h) * width + w;
const int step = height * width;
in = in + offset;
scale = scale + offset;
int head = 0;
const int pre_pad = (size - 1) / 2;
const int post_pad = size - pre_pad - 1;
T accum_scale = 0;
// fill the scale at [n, :, h, w]
// accumulate values
while (head < post_pad && head < channels) {
accum_scale += in[head * step] * in[head * step];
++head;
}
// both add and subtract
while (head < channels) {
accum_scale += in[head * step] * in[head * step];
if (head - size >= 0) {
accum_scale -= in[(head - size) * step]
* in[(head - size) * step];
}
scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
++head;
}
// subtract only
while (head < channels + post_pad) {
if (head - size >= 0) {
accum_scale -= in[(head - size) * step]
* in[(head - size) * step];
}
scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
++head;
}
}
}

@ -0,0 +1,94 @@
/*************************************************************************************
* Copyright (c) 2015, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation and/or
* other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/
__kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* top_data
#ifdef MASK
, __global int* mask, __global T* top_mask
#endif
) {
int index = get_global_id(0);
int tmp = get_global_size(0);
for (; index < nthreads; index += tmp) {
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
int hstart = ph * stride_h - pad_h;
int wstart = pw * stride_w - pad_w;
const int hend = min(hstart + kernel_h, height);
const int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
T maxval = -FLT_MAX;
int maxidx = -1;
bottom_data =
bottom_data + (n * channels + c) * height * width;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
if (bottom_data[h * width + w] > maxval) {
maxidx = h * width + w;
maxval = bottom_data[maxidx];
}
}
}
top_data[index] = maxval;
#ifdef MASK
if (mask) {
mask[index] = maxidx;
} else {
top_mask[index] = maxidx;
}
#endif
}
}
__kernel void AvePoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global T* top_data) {
int index = get_global_id(0);
int tmp = get_global_size(0);
for (; index < nthreads; index += tmp) {
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
int hstart = ph * stride_h - pad_h;
int wstart = pw * stride_w - pad_w;
int hend = min(hstart + kernel_h, height + pad_h);
int wend = min(wstart + kernel_w, width + pad_w);
const int pool_size = (hend - hstart) * (wend - wstart);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
hend = min(hend, height);
wend = min(wend, width);
T aveval = 0;
bottom_data =
bottom_data + (n * channels + c) * height * width;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
aveval += bottom_data[h * width + w];
}
}
top_data[index] = aveval / pool_size;
}
}

@ -0,0 +1,75 @@
/*************************************************************************************
* Copyright (c) 2015, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation and/or
* other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/
__kernel void kernel_channel_max(const int num, const int channels,
const int spatial_dim, __global const T* data, __global T* out) {
int index = get_global_id(0);
if(index < num * spatial_dim) {
int n = index / spatial_dim;
int s = index % spatial_dim;
T maxval = -FLT_MAX;
for (int c = 0; c < channels; ++c) {
maxval = max(data[(n * channels + c) * spatial_dim + s], maxval);
}
out[index] = maxval;
}
}
__kernel void kernel_channel_subtract(const int count,
const int num, const int channels,
const int spatial_dim, __global const T* channel_max, __global T* data) {
int index = get_global_id(0);
if(index < count) {
int n = index / channels / spatial_dim;
int s = index % spatial_dim;
data[index] -= channel_max[n * spatial_dim + s];
}
}
__kernel void kernel_channel_sum(const int num, const int channels,
const int spatial_dim, __global const T* data, __global T* channel_sum) {
int index = get_global_id(0);
if(index < num * spatial_dim) {
int n = index / spatial_dim;
int s = index % spatial_dim;
T sum = 0;
for (int c = 0; c < channels; ++c) {
sum += data[(n * channels + c) * spatial_dim + s];
}
channel_sum[index] = sum;
}
}
__kernel void kernel_channel_div(const int count,
const int num, const int channels,
const int spatial_dim, __global const T* channel_sum, __global T* data) {
int index = get_global_id(0);
if(index < count) {
int n = index / channels / spatial_dim;
int s = index % spatial_dim;
data[index] /= channel_sum[n * spatial_dim + s];
}
}

@ -40,4 +40,5 @@
//M*/
#include <opencv2/core.hpp>
#include "cvconfig.h"
#include <opencv2/dnn.hpp>

@ -52,6 +52,12 @@ namespace dnn {
#if defined(ENABLE_TORCH_IMPORTER) && ENABLE_TORCH_IMPORTER
#include "THDiskFile.h"
#ifdef NDEBUG
static bool dbgPrint = false;
#else
static bool dbgPrint = true;
#endif
enum LuaType
{
TYPE_NIL = 0,
@ -290,7 +296,8 @@ struct TorchImporter : public ::cv::dnn::Importer
}
String key = readString();
if (dbgPrint)
std::cout << i << "th key: " << key << "\n";
fpos = THFile_position(file);
int vtype = readInt();
@ -334,13 +341,16 @@ struct TorchImporter : public ::cv::dnn::Importer
}
//Debug output
std::cout << "scalarParams:\n";
std::cout << scalarParams;
if (dbgPrint)
{
std::cout << "scalarParams:\n";
std::cout << scalarParams;
std::cout << "#" << tensorParams.size() << " tensorParams:\n";
std::map<String,Blob>::const_iterator it;
for (it = tensorParams.begin(); it != tensorParams.end(); it++)
std::cout << it->first << ": Tensor " << it->second.shape() << "\n";
std::cout << "#" << tensorParams.size() << " tensorParams:\n";
std::map<String,Blob>::const_iterator it;
for (it = tensorParams.begin(); it != tensorParams.end(); it++)
std::cout << it->first << ": Tensor " << it->second.shape() << "\n";
}
}
void readTorchTensor(int indexTensor, int typeTensor)
@ -435,7 +445,9 @@ struct TorchImporter : public ::cv::dnn::Importer
String className = readTorchClassName();
String nnName;
std::cout << "Class: " << className << std::endl;
if (dbgPrint)
std::cout << "Class: " << className << std::endl;
int type;
if ( (type = parseTensorType(className)) >= 0 ) //is Tensor

@ -42,6 +42,8 @@
#if defined(ENABLE_CAFFE_MODEL_TESTS)
#include "test_precomp.hpp"
#include "npy_blob.hpp"
#include <opencv2/core/ocl.hpp>
#include <opencv2/ts/ocl_test.hpp>
namespace cvtest
{
@ -55,7 +57,7 @@ static std::string _tf(TString filename)
return (getOpenCVExtraDir() + "/dnn/") + filename;
}
static void launchGoogleNetTest()
{
Net net;
{
@ -69,7 +71,7 @@ TEST(Reproducibility_GoogLeNet, Accuracy)
inpMats.push_back( imread(_tf("googlenet_1.jpg")) );
ASSERT_TRUE(!inpMats[0].empty() && !inpMats[1].empty());
net.setBlob(".data", Blob(inpMats));
net.setBlob(".data", Blob::fromImages(inpMats));
net.forward();
Blob out = net.getBlob("prob");
@ -77,5 +79,16 @@ TEST(Reproducibility_GoogLeNet, Accuracy)
normAssert(out, ref);
}
TEST(Reproducibility_GoogLeNet, Accuracy)
{
OCL_OFF(launchGoogleNetTest());
}
OCL_TEST(Reproducibility_GoogLeNet, Accuracy)
{
OCL_ON(launchGoogleNetTest());
OCL_OFF();
}
}
#endif

@ -43,6 +43,8 @@
#include <opencv2/core/ocl.hpp>
#include <iostream>
#include "npy_blob.hpp"
#include <opencv2/dnn/all_layers.hpp>
#include <opencv2/ts/ocl_test.hpp>
namespace cvtest
{
@ -56,7 +58,32 @@ static String _tf(TString filename)
return (getOpenCVExtraDir() + "/dnn/layers/") + filename;
}
enum RunLayerMode
{
ALLOC_ONLY = 1,
FORWARD_ONLY = 2,
ALLOC_AND_FORWARD = ALLOC_ONLY | FORWARD_ONLY
};
typedef Ptr<std::vector<Blob*> > PtrToVecPtrBlob;
PtrToVecPtrBlob
runLayer(Ptr<Layer> layer, std::vector<Blob> &inpBlobs, std::vector<Blob> &outBlobs, int mode = ALLOC_AND_FORWARD)
{
PtrToVecPtrBlob inpPtrs(new std::vector<Blob*>());
inpPtrs->reserve(inpBlobs.size());
for (size_t i = 0; i < inpBlobs.size(); i++)
inpPtrs->push_back(&inpBlobs[i]);
if (mode & ALLOC_ONLY) layer->allocate(*inpPtrs, outBlobs);
if (mode & FORWARD_ONLY) layer->forward(*inpPtrs, outBlobs);
return inpPtrs;
}
void testLayerUsingCaffeModels(String basename, bool useCaffeModel = false, bool useCommonInputBlob = true)
{
String prototxt = _tf(basename + ".prototxt");
String caffemodel = _tf(basename + ".caffemodel");
@ -64,6 +91,8 @@ static void testLayer(String basename, bool useCaffeModel = false, bool useCommo
String inpfile = (useCommonInputBlob) ? _tf("blob.npy") : _tf(basename + ".input.npy");
String outfile = _tf(basename + ".npy");
cv::setNumThreads(cv::getNumberOfCPUs());
Net net;
{
Ptr<Importer> importer = createCaffeImporter(prototxt, (useCaffeModel) ? caffemodel : String());
@ -83,58 +112,89 @@ static void testLayer(String basename, bool useCaffeModel = false, bool useCommo
TEST(Layer_Test_Softmax, Accuracy)
{
testLayer("layer_softmax");
OCL_OFF(testLayerUsingCaffeModels("layer_softmax"));
}
OCL_TEST(Layer_Test_Softmax, Accuracy)
{
OCL_ON(testLayerUsingCaffeModels("layer_softmax"));
OCL_OFF();
}
TEST(Layer_Test_LRN_spatial, Accuracy)
{
testLayer("layer_lrn_spatial");
OCL_OFF(testLayerUsingCaffeModels("layer_lrn_spatial"));
}
OCL_TEST(Layer_Test_LRN_spatial, Accuracy)
{
OCL_ON(testLayerUsingCaffeModels("layer_lrn_spatial"));
OCL_OFF();
}
TEST(Layer_Test_LRN_channels, Accuracy)
{
testLayer("layer_lrn_channels");
OCL_OFF(testLayerUsingCaffeModels("layer_lrn_channels"));
}
OCL_TEST(Layer_Test_LRN_channels, Accuracy)
{
OCL_ON(testLayerUsingCaffeModels("layer_lrn_channels"));
OCL_OFF();
}
TEST(Layer_Test_Convolution, Accuracy)
{
testLayer("layer_convolution", true);
OCL_OFF(testLayerUsingCaffeModels("layer_convolution", true));
}
OCL_TEST(Layer_Test_Convolution, Accuracy)
{
OCL_ON(testLayerUsingCaffeModels("layer_convolution", true));
OCL_OFF();
}
TEST(Layer_Test_DeConvolution, Accuracy)
{
OCL_OFF(testLayerUsingCaffeModels("layer_deconvolution", true, false));
}
OCL_TEST(Layer_Test_DeConvolution, Accuracy)
{
OCL_ON(testLayerUsingCaffeModels("layer_deconvolution", true, false););
OCL_OFF();
}
TEST(Layer_Test_InnerProduct, Accuracy)
{
testLayer("layer_inner_product", true);
OCL_OFF(testLayerUsingCaffeModels("layer_inner_product", true));
}
OCL_TEST(Layer_Test_InnerProduct, Accuracy)
{
OCL_ON(testLayerUsingCaffeModels("layer_inner_product", true));
OCL_OFF();
}
TEST(Layer_Test_Pooling_max, Accuracy)
{
testLayer("layer_pooling_max");
OCL_OFF(testLayerUsingCaffeModels("layer_pooling_max"));
OCL_ON();
}
OCL_TEST(Layer_Test_Pooling_max, Accuracy)
{
OCL_ON(testLayerUsingCaffeModels("layer_pooling_max"));
OCL_OFF();
}
TEST(Layer_Test_Pooling_ave, Accuracy)
{
testLayer("layer_pooling_ave");
OCL_OFF(testLayerUsingCaffeModels("layer_pooling_ave"));
OCL_ON();
}
OCL_TEST(Layer_Test_Pooling_ave, Accuracy)
{
testLayer("layer_deconvolution", true, false);
OCL_ON(testLayerUsingCaffeModels("layer_pooling_ave"));
OCL_OFF();
}
TEST(Layer_Test_MVN, Accuracy)
{
testLayer("layer_mvn");
OCL_OFF(testLayerUsingCaffeModels("layer_mvn"));
}
TEST(Layer_Test_Reshape, squeeze)
@ -151,10 +211,31 @@ TEST(Layer_Test_Reshape, squeeze)
rl->allocate(inpVec, outVec);
rl->forward(inpVec, outVec);
EXPECT_EQ(outVec[0].shape(), BlobShape(4, 3, 2));
}
//template<typename XMat>
//static void test_Layer_Concat()
//{
// Matx21f a(1.f, 1.f), b(2.f, 2.f), c(3.f, 3.f);
// std::vector<Blob> res(1), src = { Blob(XMat(a)), Blob(XMat(b)), Blob(XMat(c)) };
// Blob ref(XMat(Matx23f(1.f, 2.f, 3.f, 1.f, 2.f, 3.f)));
//
// runLayer(ConcatLayer::create(1), src, res);
// normAssert(ref, res[0]);
//}
//TEST(Layer_Concat, Accuracy)
//{
// OCL_OFF(test_Layer_Concat<Mat>());
//}
//OCL_TEST(Layer_Concat, Accuracy)
//{
// OCL_ON(test_Layer_Concat<Mat>());
// OCL_OFF();
//}
template<typename XMat>
void test_Reshape_Split_Slice_layers()
{
Net net;
{
@ -163,9 +244,9 @@ TEST(Layer_Test_Reshape_Split_Slice, Accuracy)
importer->populateNet(net);
}
Blob input(BlobShape(6, 12));
RNG rng(0);
rng.fill(input.getRef<XMat>(), RNG::UNIFORM, -1, 1);
net.setBlob(".input", input);
net.forward();
@ -173,5 +254,143 @@ TEST(Layer_Test_Reshape_Split_Slice, Accuracy)
normAssert(input, output);
}
TEST(Layer_Test_Reshape_Split_Slice, Accuracy)
{
OCL_OFF(test_Reshape_Split_Slice_layers<Mat>());
}
OCL_TEST(Layer_Test_Reshape_Split_Slice, Accuracy)
{
OCL_ON(test_Reshape_Split_Slice_layers<UMat>());
OCL_OFF();
}
class Layer_LSTM_Test : public ::testing::Test
{
public:
int numInp, numOut;
Blob Wh, Wx, b;
Ptr<LSTMLayer> layer;
std::vector<Blob> inputs, outputs;
Layer_LSTM_Test() {}
void init(const BlobShape &inpShape_, const BlobShape &outShape_)
{
numInp = inpShape_.total();
numOut = outShape_.total();
Wh = Blob(BlobShape(4 * numOut, numOut));
Wx = Blob(BlobShape(4 * numOut, numInp));
b = Blob(BlobShape(4 * numOut, 1));
layer = LSTMLayer::create();
layer->setWeights(Wh, Wx, b);
layer->setOutShape(outShape_);
}
};
TEST_F(Layer_LSTM_Test, get_set_test)
{
BlobShape TN(4);
BlobShape inpShape(5, 3, 2), inpResShape = TN + inpShape;
BlobShape outShape(3, 1, 2), outResShape = TN + outShape;
init(inpShape, outShape);
layer->setProduceCellOutput(true);
layer->setUseTimstampsDim(false);
layer->setOutShape(outShape);
layer->setC(Blob(outResShape));
layer->setH(Blob(outResShape));
inputs.push_back(Blob(inpResShape));
runLayer(layer, inputs, outputs);
EXPECT_EQ(2, outputs.size());
EXPECT_EQ(outResShape, outputs[0].shape());
EXPECT_EQ(outResShape, outputs[1].shape());
EXPECT_EQ(outResShape, layer->getC().shape());
EXPECT_EQ(outResShape, layer->getH().shape());
EXPECT_EQ(0, layer->inputNameToIndex("x"));
EXPECT_EQ(0, layer->outputNameToIndex("h"));
EXPECT_EQ(1, layer->outputNameToIndex("c"));
}
TEST(Layer_LSTM_Test_Accuracy_with_, CaffeRecurrent)
{
Ptr<LSTMLayer> layer = LSTMLayer::create();
Blob Wx = blobFromNPY(_tf("lstm.prototxt.w_0.npy"));
Blob Wh = blobFromNPY(_tf("lstm.prototxt.w_2.npy"));
Blob b = blobFromNPY(_tf("lstm.prototxt.w_1.npy"));
layer->setWeights(Wh, Wx, b);
Blob inp = blobFromNPY(_tf("recurrent.input.npy"));
std::vector<Blob> inputs(1, inp), outputs;
runLayer(layer, inputs, outputs);
Blob h_t_reference = blobFromNPY(_tf("lstm.prototxt.h_1.npy"));
normAssert(h_t_reference, outputs[0]);
}
TEST(Layer_RNN_Test_Accuracy_with_, CaffeRecurrent)
{
Ptr<RNNLayer> layer = RNNLayer::create();
layer->setWeights(
blobFromNPY(_tf("rnn.prototxt.w_0.npy")),
blobFromNPY(_tf("rnn.prototxt.w_1.npy")),
blobFromNPY(_tf("rnn.prototxt.w_2.npy")),
blobFromNPY(_tf("rnn.prototxt.w_3.npy")),
blobFromNPY(_tf("rnn.prototxt.w_4.npy")) );
std::vector<Blob> output, input(1, blobFromNPY(_tf("recurrent.input.npy")));
runLayer(layer, input, output);
Blob h_ref = blobFromNPY(_tf("rnn.prototxt.h_1.npy"));
normAssert(h_ref, output[0]);
}
class Layer_RNN_Test : public ::testing::Test
{
public:
int nX, nH, nO, nT, nS;
Blob Whh, Wxh, bh, Who, bo;
Ptr<RNNLayer> layer;
std::vector<Blob> inputs, outputs;
Layer_RNN_Test()
{
nT = 3;
nS = 5;
nX = 31;
nH = 64;
nO = 100;
Whh = Blob(BlobShape(nH, nH));
Wxh = Blob(BlobShape(nH, nX));
bh = Blob(BlobShape(nH, 1));
Who = Blob(BlobShape(nO, nH));
bo = Blob(BlobShape(nO, 1));
layer = RNNLayer::create();
layer->setProduceHiddenOutput(true);
layer->setWeights(Wxh, bh, Whh, Who, bo);
}
};
TEST_F(Layer_RNN_Test, get_set_test)
{
inputs.push_back(Blob(BlobShape(nT, nS, 1, nX)));
runLayer(layer, inputs, outputs);
EXPECT_EQ(outputs.size(), 2);
EXPECT_EQ(outputs[0].shape(), BlobShape(nT, nS, nO));
EXPECT_EQ(outputs[1].shape(), BlobShape(nT, nS, nH));
}
}

@ -1,3 +1,31 @@
#include "test_precomp.hpp"
CV_TEST_MAIN("")
namespace cvtest
{
using namespace cv;
using namespace cv::dnn;
TEST(BlobShape_SimpleConstr, Regression)
{
BlobShape sd;
BlobShape s1(0);
EXPECT_EQ(s1.dims(), 1);
EXPECT_EQ(s1[0], 0);
BlobShape s2(0, 0);
EXPECT_EQ(s2.dims(), 2);
EXPECT_EQ(s2[0], 0);
EXPECT_EQ(s2[1], 0);
}
TEST(BlobShape_EmptyFill, Regression)
{
BlobShape s(10, (int*)NULL);
EXPECT_EQ(s.dims(), 10);
}
}

@ -0,0 +1 @@
*.caffemodel