diff --git a/modules/text/README.md b/modules/text/README.md
index 488518a28..b6955fd98 100644
--- a/modules/text/README.md
+++ b/modules/text/README.md
@@ -47,3 +47,12 @@ Notes
 2. Tesseract configure script may fail to detect leptonica, so you may have to edit the configure script - comment off some if's around this message and retain only "then" branch.
 
 3. You are encouraged to search the Net for some better pre-trained classifiers, as well as classifiers for other languages.
+
+
+Text Detection CNN
+=================
+
+Intro
+-----
+
+The text module now provides text detection and recognition based on deep CNNs. The text detector is a deep CNN that takes an image which may contain multiple words and outputs a list of Rects with the bounding boxes and the probability that each of them contains text. The text recognizer provides a probability over a given vocabulary for each of these rects.
diff --git a/modules/text/cmake/FindTesseract.cmake b/modules/text/cmake/FindTesseract.cmake
index 2a5d868f9..5bdbe2436 100644
--- a/modules/text/cmake/FindTesseract.cmake
+++ b/modules/text/cmake/FindTesseract.cmake
@@ -5,14 +5,17 @@ endif()
 if(NOT Tesseract_FOUND)
   find_path(Tesseract_INCLUDE_DIR tesseract/baseapi.h
     HINTS
+    /usr/include
     /usr/local/include)
 
   find_library(Tesseract_LIBRARY NAMES tesseract
     HINTS
+    /usr/lib
     /usr/local/lib)
 
   find_library(Lept_LIBRARY NAMES lept
     HINTS
+    /usr/lib
     /usr/local/lib)
 
   if(Tesseract_INCLUDE_DIR AND Tesseract_LIBRARY AND Lept_LIBRARY)
diff --git a/modules/text/doc/text.bib b/modules/text/doc/text.bib
index 64a8f4a19..d2ed9f9b6 100644
--- a/modules/text/doc/text.bib
+++ b/modules/text/doc/text.bib
@@ -31,4 +31,14 @@
   journal   = {CoRR},
   volume    = {abs/1407.7504},
   year      = {2014},
-}
\ No newline at end of file
+}
+@inproceedings{LiaoSBWL17,
+  author    = {Minghui Liao and
+               Baoguang Shi and
+               Xiang Bai and
+               Xinggang Wang and
+               Wenyu Liu},
+  title     = {TextBoxes: {A} Fast Text Detector with a Single Deep Neural Network},
+  booktitle = {AAAI},
+  year      = {2017}
+}
diff --git a/modules/text/include/opencv2/text.hpp b/modules/text/include/opencv2/text.hpp
index cea49c69c..c06c88983 100644
--- a/modules/text/include/opencv2/text.hpp
+++ b/modules/text/include/opencv2/text.hpp
@@ -41,6 +41,7 @@ the use of this software, even if advised of the possibility of such damage.
 
 #include "opencv2/text/erfilter.hpp"
 #include "opencv2/text/ocr.hpp"
+#include "opencv2/text/textDetector.hpp"
 
 /** @defgroup text Scene Text Detection and Recognition
 
diff --git a/modules/text/include/opencv2/text/ocr.hpp b/modules/text/include/opencv2/text/ocr.hpp
index 645afeaef..c8e0129be 100644
--- a/modules/text/include/opencv2/text/ocr.hpp
+++ b/modules/text/include/opencv2/text/ocr.hpp
@@ -44,6 +44,8 @@
 #ifndef __OPENCV_TEXT_OCR_HPP__
 #define __OPENCV_TEXT_OCR_HPP__
 
+#include <opencv2/core.hpp>
+
 #include <vector>
 #include <string>
 
diff --git a/modules/text/include/opencv2/text/textDetector.hpp b/modules/text/include/opencv2/text/textDetector.hpp
new file mode 100644
index 000000000..fdb92fdfb
--- /dev/null
+++ b/modules/text/include/opencv2/text/textDetector.hpp
@@ -0,0 +1,73 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef __OPENCV_TEXT_TEXTDETECTOR_HPP__
+#define __OPENCV_TEXT_TEXTDETECTOR_HPP__
+
+#include"ocr.hpp"
+
+namespace cv
+{
+namespace text
+{
+
+//! @addtogroup text_detect
+//! @{
+
+/** @brief An abstract class providing the common interface of the text detection algorithms
+ */
+class CV_EXPORTS_W TextDetector
+{
+public:
+    /**
+    @brief Method that provides a quick and simple interface to detect text inside an image
+
+    @param inputImage an image to process
+    @param Bbox a vector of Rect that will store the detected word bounding boxes
+    @param confidence a vector of float that will be updated with the confidence the classifier has for each bounding box in Bbox
+    */
+    CV_WRAP virtual void detect(InputArray inputImage, CV_OUT std::vector<Rect>& Bbox, CV_OUT std::vector<float>& confidence) = 0;
+    virtual ~TextDetector() {}
+};
+
+/** @brief TextDetectorCNN class provides the functionality of text bounding box detection.
+ This class finds the bounding boxes of text words in a given input image.
+ This class uses OpenCV dnn module to load pre-trained model described in @cite LiaoSBWL17.
+ The original repository with the modified SSD Caffe version: https://github.com/MhLiao/TextBoxes.
+ Model can be downloaded from [DropBox](https://www.dropbox.com/s/g8pjzv2de9gty8g/TextBoxes_icdar13.caffemodel?dl=0).
+ Modified .prototxt file with the model description can be found in `opencv_contrib/modules/text/samples/textbox.prototxt`.
+ */
+class CV_EXPORTS_W TextDetectorCNN : public TextDetector
+{
+public:
+    /**
+    @overload
+
+    @param inputImage an image expected to be a CV_8UC3 of any size
+    @param Bbox a vector of Rect that will store the detected word bounding boxes
+    @param confidence a vector of float that will be updated with the confidence the classifier has for each bounding box in Bbox
+    */
+    CV_WRAP virtual void detect(InputArray inputImage, CV_OUT std::vector<Rect>& Bbox, CV_OUT std::vector<float>& confidence) = 0;
+
+    /** @brief Creates an instance of the TextDetectorCNN class using the provided parameters.
+
+    @param modelArchFilename the relative or absolute path to the prototxt file describing the classifier's architecture.
+    @param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form.
+    @param detectionSizes a list of sizes for multiscale detection. The values `[(300,300),(700,500),(700,300),(700,700),(1600,1600)]` are
+    recommended in @cite LiaoSBWL17 to achieve the best quality.
+    */
+    static Ptr<TextDetectorCNN> create(const String& modelArchFilename, const String& modelWeightsFilename,
+                                               std::vector<Size> detectionSizes);
+    /**
+      @overload
+    */
+    CV_WRAP static Ptr<TextDetectorCNN> create(const String& modelArchFilename, const String& modelWeightsFilename);
+};
+
+//! @}
+}//namespace text
+}//namespace cv
+
+
+#endif // __OPENCV_TEXT_TEXTDETECTOR_HPP__
diff --git a/modules/text/samples/deeptextdetection.py b/modules/text/samples/deeptextdetection.py
new file mode 100644
index 000000000..256a28e9e
--- /dev/null
+++ b/modules/text/samples/deeptextdetection.py
@@ -0,0 +1,37 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+import sys
+import os
+import cv2
+import numpy as np
+
+def main():
+    """Show the TextBoxes detections of the image given as argv[1] whose confidence exceeds 0.6."""
+    print('\nDeeptextdetection.py')
+    print('       A demo script of text box alogorithm of the paper:')
+    print('       * Minghui Liao et al.: TextBoxes: A Fast Text Detector with a Single Deep Neural Network https://arxiv.org/abs/1611.06779\n')
+    if (len(sys.argv) < 2):
+        print(' (ERROR) You must call this script with an argument (path_to_image_to_be_processed)\n')
+        sys.exit(1)
+
+    if not os.path.isfile('TextBoxes_icdar13.caffemodel') or not os.path.isfile('textbox.prototxt'):
+        print(" Model files not found in current directory. Aborting")
+        print(" See the documentation of text::TextDetectorCNN class to get download links.")
+        sys.exit(1)
+
+    img = cv2.imread(str(sys.argv[1]))
+    if img is None:  # cv2.imread reports an unreadable or missing file by returning None
+        print(' (ERROR) Could not read the image: ' + str(sys.argv[1]))
+        sys.exit(1)
+    textSpotter = cv2.text.TextDetectorCNN_create("textbox.prototxt", "TextBoxes_icdar13.caffemodel")
+    rects, outProbs = textSpotter.detect(img)
+    vis = img.copy()
+    thres = 0.6
+    for rect, prob in zip(rects, outProbs):
+        if prob > thres:  # only boxes the detector is confident about are drawn
+            cv2.rectangle(vis, (rect[0],rect[1]), (rect[0] + rect[2], rect[1] + rect[3]), (255, 0, 0), 2)
+    cv2.imshow("Text detection result", vis)
+    cv2.waitKey()
+
+if __name__ == "__main__":
+    main()
diff --git a/modules/text/samples/dictnet_demo.cpp b/modules/text/samples/dictnet_demo.cpp
index 277a1c9be..f70f2c175 100644
--- a/modules/text/samples/dictnet_demo.cpp
+++ b/modules/text/samples/dictnet_demo.cpp
@@ -1,12 +1,3 @@
-/*
- * dictnet_demo.cpp
- *
- * Demonstrates simple use of the holistic word classifier in C++
- *
- * Created on: June 26, 2016
- *     Author: Anguelos Nicolaou <anguelos.nicolaou AT gmail.com>
- */
-
 #include  "opencv2/text.hpp"
 #include  "opencv2/highgui.hpp"
 #include  "opencv2/imgproc.hpp"
diff --git a/modules/text/samples/text_recognition_cnn.cpp b/modules/text/samples/text_recognition_cnn.cpp
new file mode 100644
index 000000000..84df57d29
--- /dev/null
+++ b/modules/text/samples/text_recognition_cnn.cpp
@@ -0,0 +1,122 @@
+#include <opencv2/text.hpp>
+#include <opencv2/highgui.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/dnn.hpp>
+
+#include  <iostream>
+#include  <fstream>
+
+using namespace cv;
+using namespace std;
+
+namespace
+{
+void printHelpStr(const string& progFname)
+{   // Prints the demo description, the command line usage and the model download hints to stdout.
+    cout << "   Demo of text recognition CNN for text detection." << endl
+         << "   Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015"<<endl<<endl
+         << "   Usage: " << progFname << " <input_image>" << endl  // main() reads the image path from argv[1]
+         << "   Caffe Model files (textbox.prototxt, TextBoxes_icdar13.caffemodel)"<<endl
+         << "     must be in the current directory. See the documentation of text::TextDetectorCNN class to get download links." << endl
+         << "   Obtaining text recognition Caffe Model files in linux shell:" << endl
+         << "   wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg.caffemodel" << endl
+         << "   wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_deploy.prototxt" << endl
+         << "   wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_labels.txt" <<endl << endl;
+}
+
+bool fileExists (const string& filename)
+{
+    // The file is reported as existing when a read stream on it opens in a good state.
+    return ifstream(filename.c_str()).good();
+}
+
+void textbox_draw(Mat src, std::vector<Rect>& groups, std::vector<float>& probs, std::vector<int>& indexes)
+{
+    // Draws the boxes selected by indexes on src; CV_8UC3 images additionally get a confidence label per box.
+    for (size_t i = 0; i < indexes.size(); i++)
+    {
+        // indexes[i] is a position in groups/probs, so the box is looked up through it in both branches.
+        const Rect currrentBox = groups[indexes[i]];
+        if (src.type() == CV_8UC3)
+        {
+            rectangle(src, currrentBox, Scalar( 0, 255, 255 ), 2, LINE_AA);
+            String label = format("%.2f", probs[indexes[i]]);
+            std::cout << "text box: " << currrentBox << " confidence: " << probs[indexes[i]] << "\n";
+            int baseLine = 0;
+            Size labelSize = getTextSize(label, FONT_HERSHEY_PLAIN, 1, 1, &baseLine);
+            int yLeftBottom = std::max(currrentBox.y, labelSize.height);
+            rectangle(src, Point(currrentBox.x, yLeftBottom - labelSize.height),
+                      Point(currrentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED);
+            putText(src, label, Point(currrentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA);
+        }
+        else
+            rectangle(src, currrentBox, Scalar( 255 ), 3, 8 );
+    }
+}
+
+}
+
+int main(int argc, const char * argv[])
+{
+    // Demo flow: detect word boxes with the TextBoxes CNN, filter them with NMS, then recognize every kept word.
+    if (argc < 2)
+    {
+        printHelpStr(argv[0]);
+        cout << "Insufiecient parameters. Aborting!" << endl;
+        exit(1);
+    }
+
+    const string modelArch = "textbox.prototxt";
+    const string moddelWeights = "TextBoxes_icdar13.caffemodel";
+    if (!fileExists(modelArch) || !fileExists(moddelWeights))
+    {
+        printHelpStr(argv[0]);
+        cout << "Model files not found in the current directory. Aborting!" << endl;
+        exit(1);
+    }
+
+    // imread() returns an empty Mat when the file cannot be read; stop before the networks are fed with it.
+    Mat image = imread(String(argv[1]), IMREAD_COLOR);
+    if (image.empty())
+    {
+        cout << "Could not read the input image " << argv[1] << ". Aborting!" << endl;
+        exit(1);
+    }
+
+    cout << "Starting Text Box Demo" << endl;
+    Ptr<text::TextDetectorCNN> textSpotter = text::TextDetectorCNN::create(modelArch, moddelWeights);
+    vector<Rect> bbox;
+    vector<float> outProbabillities;
+    textSpotter->detect(image, bbox, outProbabillities);
+    // Non-maximum suppression: indexes receives the positions (into bbox) of the boxes that are kept.
+    std::vector<int> indexes;
+    cv::dnn::NMSBoxes(bbox, outProbabillities, 0.4f, 0.5f, indexes);
+
+    Mat image_copy = image.clone();
+    textbox_draw(image_copy, bbox, outProbabillities, indexes);
+    imshow("Text detection", image_copy);
+    image_copy = image.clone();
+
+    Ptr<text::OCRHolisticWordRecognizer> wordSpotter =
+            text::OCRHolisticWordRecognizer::create("dictnet_vgg_deploy.prototxt", "dictnet_vgg.caffemodel", "dictnet_vgg_labels.txt");
+    for(size_t i = 0; i < indexes.size(); i++)
+    {
+        Mat wordImg;
+        cvtColor(image(bbox[indexes[i]]), wordImg, COLOR_BGR2GRAY);
+        string word;
+        vector<float> confs;
+        wordSpotter->run(wordImg, word, NULL, NULL, &confs);
+        Rect currrentBox = bbox[indexes[i]];
+        rectangle(image_copy, currrentBox, Scalar( 0, 255, 255 ), 2, LINE_AA);
+        int baseLine = 0;
+        Size labelSize = getTextSize(word, FONT_HERSHEY_PLAIN, 1, 1, &baseLine);
+        int yLeftBottom = std::max(currrentBox.y, labelSize.height);
+        rectangle(image_copy, Point(currrentBox.x, yLeftBottom - labelSize.height),
+                  Point(currrentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED);
+        putText(image_copy, word, Point(currrentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA);
+    }
+    imshow("Text recognition", image_copy);
+    cout << "Recognition finished. Press any key to exit.\n";
+    waitKey();
+    return 0;
+}
diff --git a/modules/text/samples/textbox.prototxt b/modules/text/samples/textbox.prototxt
new file mode 100644
index 000000000..bb8019828
--- /dev/null
+++ b/modules/text/samples/textbox.prototxt
@@ -0,0 +1,1611 @@
+name: "VGG_text_longer_conv_300x300_deploy"
+input: "data"
+input_shape {
+  dim: 1
+  dim: 3
+  dim: 700
+  dim: 700
+}
+layer {
+  name: "conv1_1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1_1"
+  param {
+    lr_mult: 0
+    decay_mult: 0
+  }
+  param {
+    lr_mult: 0
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 64
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu1_1"
+  type: "ReLU"
+  bottom: "conv1_1"
+  top: "conv1_1"
+}
+layer {
+  name: "conv1_2"
+  type: "Convolution"
+  bottom: "conv1_1"
+  top: "conv1_2"
+  param {
+    lr_mult: 0
+    decay_mult: 0
+  }
+  param {
+    lr_mult: 0
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 64
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu1_2"
+  type: "ReLU"
+  bottom: "conv1_2"
+  top: "conv1_2"
+}
+layer {
+  name: "pool1"
+  type: "Pooling"
+  bottom: "conv1_2"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 2
+    stride: 2
+  }
+}
+layer {
+  name: "conv2_1"
+  type: "Convolution"
+  bottom: "pool1"
+  top: "conv2_1"
+  param {
+    lr_mult: 0
+    decay_mult: 0
+  }
+  param {
+    lr_mult: 0
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 128
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu2_1"
+  type: "ReLU"
+  bottom: "conv2_1"
+  top: "conv2_1"
+}
+layer {
+  name: "conv2_2"
+  type: "Convolution"
+  bottom: "conv2_1"
+  top: "conv2_2"
+  param {
+    lr_mult: 0
+    decay_mult: 0
+  }
+  param {
+    lr_mult: 0
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 128
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu2_2"
+  type: "ReLU"
+  bottom: "conv2_2"
+  top: "conv2_2"
+}
+layer {
+  name: "pool2"
+  type: "Pooling"
+  bottom: "conv2_2"
+  top: "pool2"
+  pooling_param {
+    pool: MAX
+    kernel_size: 2
+    stride: 2
+  }
+}
+layer {
+  name: "conv3_1"
+  type: "Convolution"
+  bottom: "pool2"
+  top: "conv3_1"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu3_1"
+  type: "ReLU"
+  bottom: "conv3_1"
+  top: "conv3_1"
+}
+layer {
+  name: "conv3_2"
+  type: "Convolution"
+  bottom: "conv3_1"
+  top: "conv3_2"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu3_2"
+  type: "ReLU"
+  bottom: "conv3_2"
+  top: "conv3_2"
+}
+layer {
+  name: "conv3_3"
+  type: "Convolution"
+  bottom: "conv3_2"
+  top: "conv3_3"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu3_3"
+  type: "ReLU"
+  bottom: "conv3_3"
+  top: "conv3_3"
+}
+layer {
+  name: "pool3"
+  type: "Pooling"
+  bottom: "conv3_3"
+  top: "pool3"
+  pooling_param {
+    pool: MAX
+    kernel_size: 2
+    stride: 2
+  }
+}
+layer {
+  name: "conv4_1"
+  type: "Convolution"
+  bottom: "pool3"
+  top: "conv4_1"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 512
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu4_1"
+  type: "ReLU"
+  bottom: "conv4_1"
+  top: "conv4_1"
+}
+layer {
+  name: "conv4_2"
+  type: "Convolution"
+  bottom: "conv4_1"
+  top: "conv4_2"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 512
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu4_2"
+  type: "ReLU"
+  bottom: "conv4_2"
+  top: "conv4_2"
+}
+layer {
+  name: "conv4_3"
+  type: "Convolution"
+  bottom: "conv4_2"
+  top: "conv4_3"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 512
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu4_3"
+  type: "ReLU"
+  bottom: "conv4_3"
+  top: "conv4_3"
+}
+layer {
+  name: "pool4"
+  type: "Pooling"
+  bottom: "conv4_3"
+  top: "pool4"
+  pooling_param {
+    pool: MAX
+    kernel_size: 2
+    stride: 2
+  }
+}
+layer {
+  name: "conv5_1"
+  type: "Convolution"
+  bottom: "pool4"
+  top: "conv5_1"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 512
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu5_1"
+  type: "ReLU"
+  bottom: "conv5_1"
+  top: "conv5_1"
+}
+layer {
+  name: "conv5_2"
+  type: "Convolution"
+  bottom: "conv5_1"
+  top: "conv5_2"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 512
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu5_2"
+  type: "ReLU"
+  bottom: "conv5_2"
+  top: "conv5_2"
+}
+layer {
+  name: "conv5_3"
+  type: "Convolution"
+  bottom: "conv5_2"
+  top: "conv5_3"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 512
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu5_3"
+  type: "ReLU"
+  bottom: "conv5_3"
+  top: "conv5_3"
+}
+layer {
+  name: "pool5"
+  type: "Pooling"
+  bottom: "conv5_3"
+  top: "pool5"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 1
+    pad: 1
+  }
+}
+layer {
+  name: "fc6"
+  type: "Convolution"
+  bottom: "pool5"
+  top: "fc6"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 1024
+    pad: 6
+    kernel_size: 3
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+    dilation: 6
+  }
+}
+layer {
+  name: "relu6"
+  type: "ReLU"
+  bottom: "fc6"
+  top: "fc6"
+}
+layer {
+  name: "fc7"
+  type: "Convolution"
+  bottom: "fc6"
+  top: "fc7"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 1024
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu7"
+  type: "ReLU"
+  bottom: "fc7"
+  top: "fc7"
+}
+layer {
+  name: "conv6_1"
+  type: "Convolution"
+  bottom: "fc7"
+  top: "conv6_1"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 0
+    kernel_size: 1
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "conv6_1_relu"
+  type: "ReLU"
+  bottom: "conv6_1"
+  top: "conv6_1"
+}
+layer {
+  name: "conv6_2"
+  type: "Convolution"
+  bottom: "conv6_1"
+  top: "conv6_2"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 512
+    pad: 1
+    kernel_size: 3
+    stride: 2
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "conv6_2_relu"
+  type: "ReLU"
+  bottom: "conv6_2"
+  top: "conv6_2"
+}
+layer {
+  name: "conv7_1"
+  type: "Convolution"
+  bottom: "conv6_2"
+  top: "conv7_1"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 128
+    pad: 0
+    kernel_size: 1
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "conv7_1_relu"
+  type: "ReLU"
+  bottom: "conv7_1"
+  top: "conv7_1"
+}
+layer {
+  name: "conv7_2"
+  type: "Convolution"
+  bottom: "conv7_1"
+  top: "conv7_2"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+    stride: 2
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "conv7_2_relu"
+  type: "ReLU"
+  bottom: "conv7_2"
+  top: "conv7_2"
+}
+layer {
+  name: "conv8_1"
+  type: "Convolution"
+  bottom: "conv7_2"
+  top: "conv8_1"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 128
+    pad: 0
+    kernel_size: 1
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "conv8_1_relu"
+  type: "ReLU"
+  bottom: "conv8_1"
+  top: "conv8_1"
+}
+layer {
+  name: "conv8_2"
+  type: "Convolution"
+  bottom: "conv8_1"
+  top: "conv8_2"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+    stride: 2
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "conv8_2_relu"
+  type: "ReLU"
+  bottom: "conv8_2"
+  top: "conv8_2"
+}
+layer {
+  name: "pool6"
+  type: "Pooling"
+  bottom: "conv8_2"
+  top: "pool6"
+  pooling_param {
+    pool: AVE
+    global_pooling: true
+  }
+}
+layer {
+  name: "conv4_3_norm"
+  type: "Normalize"
+  bottom: "conv4_3"
+  top: "conv4_3_norm"
+  norm_param {
+    across_spatial: false
+    scale_filler {
+      type: "constant"
+      value: 20
+    }
+    channel_shared: false
+  }
+}
+layer {
+  name: "conv4_3_norm_mbox_loc"
+  type: "Convolution"
+  bottom: "conv4_3_norm"
+  top: "conv4_3_norm_mbox_loc"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 48
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+    pad_h: 0
+    pad_w: 2
+    kernel_h: 1
+    kernel_w: 5
+    stride_h: 1
+    stride_w: 1
+  }
+}
+layer {
+  name: "conv4_3_norm_mbox_loc_perm"
+  type: "Permute"
+  bottom: "conv4_3_norm_mbox_loc"
+  top: "conv4_3_norm_mbox_loc_perm"
+  permute_param {
+    order: 0
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "conv4_3_norm_mbox_loc_flat"
+  type: "Flatten"
+  bottom: "conv4_3_norm_mbox_loc_perm"
+  top: "conv4_3_norm_mbox_loc_flat"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "conv4_3_norm_mbox_conf"
+  type: "Convolution"
+  bottom: "conv4_3_norm"
+  top: "conv4_3_norm_mbox_conf"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 24
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+    pad_h: 0
+    pad_w: 2
+    kernel_h: 1
+    kernel_w: 5
+    stride_h: 1
+    stride_w: 1
+  }
+}
+layer {
+  name: "conv4_3_norm_mbox_conf_perm"
+  type: "Permute"
+  bottom: "conv4_3_norm_mbox_conf"
+  top: "conv4_3_norm_mbox_conf_perm"
+  permute_param {
+    order: 0
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "conv4_3_norm_mbox_conf_flat"
+  type: "Flatten"
+  bottom: "conv4_3_norm_mbox_conf_perm"
+  top: "conv4_3_norm_mbox_conf_flat"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "conv4_3_norm_mbox_priorbox"
+  type: "PriorBox"
+  bottom: "conv4_3_norm"
+  bottom: "data"
+  top: "conv4_3_norm_mbox_priorbox"
+  prior_box_param {
+    min_size: 30.0
+    aspect_ratio: 2
+    aspect_ratio: 3
+    aspect_ratio: 5
+    aspect_ratio: 7
+    aspect_ratio: 10
+    flip: false
+    clip: true
+    variance: 0.1
+    variance: 0.1
+    variance: 0.2
+    variance: 0.2
+    additional_y_offset: true
+  }
+}
+layer {
+  name: "fc7_mbox_loc"
+  type: "Convolution"
+  bottom: "fc7"
+  top: "fc7_mbox_loc"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 56
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+    pad_h: 0
+    pad_w: 2
+    kernel_h: 1
+    kernel_w: 5
+    stride_h: 1
+    stride_w: 1
+  }
+}
+layer {
+  name: "fc7_mbox_loc_perm"
+  type: "Permute"
+  bottom: "fc7_mbox_loc"
+  top: "fc7_mbox_loc_perm"
+  permute_param {
+    order: 0
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "fc7_mbox_loc_flat"
+  type: "Flatten"
+  bottom: "fc7_mbox_loc_perm"
+  top: "fc7_mbox_loc_flat"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "fc7_mbox_conf"
+  type: "Convolution"
+  bottom: "fc7"
+  top: "fc7_mbox_conf"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 28
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+    pad_h: 0
+    pad_w: 2
+    kernel_h: 1
+    kernel_w: 5
+    stride_h: 1
+    stride_w: 1
+  }
+}
+layer {
+  name: "fc7_mbox_conf_perm"
+  type: "Permute"
+  bottom: "fc7_mbox_conf"
+  top: "fc7_mbox_conf_perm"
+  permute_param {
+    order: 0
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "fc7_mbox_conf_flat"
+  type: "Flatten"
+  bottom: "fc7_mbox_conf_perm"
+  top: "fc7_mbox_conf_flat"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "fc7_mbox_priorbox"
+  type: "PriorBox"
+  bottom: "fc7"
+  bottom: "data"
+  top: "fc7_mbox_priorbox"
+  prior_box_param {
+    min_size: 60.0
+    max_size: 114.0
+    aspect_ratio: 2
+    aspect_ratio: 3
+    aspect_ratio: 5
+    aspect_ratio: 7
+    aspect_ratio: 10
+    flip: false
+    clip: true
+    variance: 0.1
+    variance: 0.1
+    variance: 0.2
+    variance: 0.2
+    additional_y_offset: true
+  }
+}
+layer {
+  name: "conv6_2_mbox_loc"
+  type: "Convolution"
+  bottom: "conv6_2"
+  top: "conv6_2_mbox_loc"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 56
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+    pad_h: 0
+    pad_w: 2
+    kernel_h: 1
+    kernel_w: 5
+    stride_h: 1
+    stride_w: 1
+  }
+}
+layer {
+  name: "conv6_2_mbox_loc_perm"
+  type: "Permute"
+  bottom: "conv6_2_mbox_loc"
+  top: "conv6_2_mbox_loc_perm"
+  permute_param {
+    order: 0
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "conv6_2_mbox_loc_flat"
+  type: "Flatten"
+  bottom: "conv6_2_mbox_loc_perm"
+  top: "conv6_2_mbox_loc_flat"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "conv6_2_mbox_conf"
+  type: "Convolution"
+  bottom: "conv6_2"
+  top: "conv6_2_mbox_conf"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 28
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+    pad_h: 0
+    pad_w: 2
+    kernel_h: 1
+    kernel_w: 5
+    stride_h: 1
+    stride_w: 1
+  }
+}
+layer {
+  name: "conv6_2_mbox_conf_perm"
+  type: "Permute"
+  bottom: "conv6_2_mbox_conf"
+  top: "conv6_2_mbox_conf_perm"
+  permute_param {
+    order: 0
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "conv6_2_mbox_conf_flat"
+  type: "Flatten"
+  bottom: "conv6_2_mbox_conf_perm"
+  top: "conv6_2_mbox_conf_flat"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "conv6_2_mbox_priorbox"
+  type: "PriorBox"
+  bottom: "conv6_2"
+  bottom: "data"
+  top: "conv6_2_mbox_priorbox"
+  prior_box_param {
+    min_size: 114.0
+    max_size: 168.0
+    aspect_ratio: 2
+    aspect_ratio: 3
+    aspect_ratio: 5
+    aspect_ratio: 7
+    aspect_ratio: 10
+    flip: false
+    clip: true
+    variance: 0.1
+    variance: 0.1
+    variance: 0.2
+    variance: 0.2
+    additional_y_offset: true
+  }
+}
+layer {
+  name: "conv7_2_mbox_loc"  # conv7_2 head: loc branch, conf branch and priors, all fed by "conv7_2"
+  type: "Convolution"
+  bottom: "conv7_2"
+  top: "conv7_2_mbox_loc"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 56  # twice the 28 outputs of conv7_2_mbox_conf
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+    pad_h: 0
+    pad_w: 2  # with kernel_w: 5 and stride_w: 1 the output width equals the input width
+    kernel_h: 1
+    kernel_w: 5
+    stride_h: 1
+    stride_w: 1
+  }
+}
+layer {
+  name: "conv7_2_mbox_loc_perm"
+  type: "Permute"
+  bottom: "conv7_2_mbox_loc"
+  top: "conv7_2_mbox_loc_perm"
+  permute_param {
+    order: 0  # new axis order is 0,2,3,1: the old axis 1 moves to the end
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "conv7_2_mbox_loc_flat"
+  type: "Flatten"
+  bottom: "conv7_2_mbox_loc_perm"
+  top: "conv7_2_mbox_loc_flat"
+  flatten_param {
+    axis: 1  # flatten starting at axis 1; this output is one input of the mbox_loc Concat
+  }
+}
+layer {
+  name: "conv7_2_mbox_conf"  # NOTE(review): presumably the per-prior class scores for conv7_2 - confirm
+  type: "Convolution"
+  bottom: "conv7_2"
+  top: "conv7_2_mbox_conf"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 28  # half of conv7_2_mbox_loc's 56 outputs
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+    pad_h: 0
+    pad_w: 2  # same 1x5 kernel geometry as the loc branch
+    kernel_h: 1
+    kernel_w: 5
+    stride_h: 1
+    stride_w: 1
+  }
+}
+layer {
+  name: "conv7_2_mbox_conf_perm"
+  type: "Permute"
+  bottom: "conv7_2_mbox_conf"
+  top: "conv7_2_mbox_conf_perm"
+  permute_param {
+    order: 0  # same 0,2,3,1 reordering as the loc branch
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "conv7_2_mbox_conf_flat"
+  type: "Flatten"
+  bottom: "conv7_2_mbox_conf_perm"
+  top: "conv7_2_mbox_conf_flat"
+  flatten_param {
+    axis: 1  # this output is one input of the mbox_conf Concat
+  }
+}
+layer {
+  name: "conv7_2_mbox_priorbox"
+  type: "PriorBox"
+  bottom: "conv7_2"  # NOTE(review): presumably the feature map the priors are laid out on - confirm
+  bottom: "data"  # NOTE(review): presumably the network input, used for the image size - confirm
+  top: "conv7_2_mbox_priorbox"
+  prior_box_param {
+    min_size: 168.0  # equals max_size of conv6_2_mbox_priorbox
+    max_size: 222.0  # equals min_size of conv8_2_mbox_priorbox
+    aspect_ratio: 2
+    aspect_ratio: 3
+    aspect_ratio: 5
+    aspect_ratio: 7
+    aspect_ratio: 10
+    flip: false
+    clip: true
+    variance: 0.1
+    variance: 0.1
+    variance: 0.2
+    variance: 0.2
+    additional_y_offset: true  # NOTE(review): looks like a TextBoxes-specific PriorBox option - confirm the parser supports it
+  }
+}
+layer {
+  name: "conv8_2_mbox_loc"  # conv8_2 head: loc branch, conf branch and priors, all fed by "conv8_2"
+  type: "Convolution"
+  bottom: "conv8_2"
+  top: "conv8_2_mbox_loc"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 56  # twice the 28 outputs of conv8_2_mbox_conf
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+    pad_h: 0
+    pad_w: 2  # with kernel_w: 5 and stride_w: 1 the output width equals the input width
+    kernel_h: 1
+    kernel_w: 5
+    stride_h: 1
+    stride_w: 1
+  }
+}
+layer {
+  name: "conv8_2_mbox_loc_perm"
+  type: "Permute"
+  bottom: "conv8_2_mbox_loc"
+  top: "conv8_2_mbox_loc_perm"
+  permute_param {
+    order: 0  # new axis order is 0,2,3,1: the old axis 1 moves to the end
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "conv8_2_mbox_loc_flat"
+  type: "Flatten"
+  bottom: "conv8_2_mbox_loc_perm"
+  top: "conv8_2_mbox_loc_flat"
+  flatten_param {
+    axis: 1  # flatten starting at axis 1; this output is one input of the mbox_loc Concat
+  }
+}
+layer {
+  name: "conv8_2_mbox_conf"  # NOTE(review): presumably the per-prior class scores for conv8_2 - confirm
+  type: "Convolution"
+  bottom: "conv8_2"
+  top: "conv8_2_mbox_conf"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 28  # half of conv8_2_mbox_loc's 56 outputs
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+    pad_h: 0
+    pad_w: 2  # same 1x5 kernel geometry as the loc branch
+    kernel_h: 1
+    kernel_w: 5
+    stride_h: 1
+    stride_w: 1
+  }
+}
+layer {
+  name: "conv8_2_mbox_conf_perm"
+  type: "Permute"
+  bottom: "conv8_2_mbox_conf"
+  top: "conv8_2_mbox_conf_perm"
+  permute_param {
+    order: 0  # same 0,2,3,1 reordering as the loc branch
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "conv8_2_mbox_conf_flat"
+  type: "Flatten"
+  bottom: "conv8_2_mbox_conf_perm"
+  top: "conv8_2_mbox_conf_flat"
+  flatten_param {
+    axis: 1  # this output is one input of the mbox_conf Concat
+  }
+}
+layer {
+  name: "conv8_2_mbox_priorbox"
+  type: "PriorBox"
+  bottom: "conv8_2"  # NOTE(review): presumably the feature map the priors are laid out on - confirm
+  bottom: "data"  # NOTE(review): presumably the network input, used for the image size - confirm
+  top: "conv8_2_mbox_priorbox"
+  prior_box_param {
+    min_size: 222.0  # equals max_size of conv7_2_mbox_priorbox
+    max_size: 276.0  # equals min_size of pool6_mbox_priorbox
+    aspect_ratio: 2
+    aspect_ratio: 3
+    aspect_ratio: 5
+    aspect_ratio: 7
+    aspect_ratio: 10
+    flip: false
+    clip: true
+    variance: 0.1
+    variance: 0.1
+    variance: 0.2
+    variance: 0.2
+    additional_y_offset: true  # NOTE(review): looks like a TextBoxes-specific PriorBox option - confirm the parser supports it
+  }
+}
+layer {
+  name: "pool6_mbox_loc"  # pool6 head: loc branch, conf branch and priors, all fed by "pool6"
+  type: "Convolution"
+  bottom: "pool6"
+  top: "pool6_mbox_loc"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 56  # twice the 28 outputs of pool6_mbox_conf
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+    pad_h: 0
+    pad_w: 2  # with kernel_w: 5 and stride_w: 1 the output width equals the input width
+    kernel_h: 1
+    kernel_w: 5
+    stride_h: 1
+    stride_w: 1
+  }
+}
+layer {
+  name: "pool6_mbox_loc_perm"
+  type: "Permute"
+  bottom: "pool6_mbox_loc"
+  top: "pool6_mbox_loc_perm"
+  permute_param {
+    order: 0  # new axis order is 0,2,3,1: the old axis 1 moves to the end
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "pool6_mbox_loc_flat"
+  type: "Flatten"
+  bottom: "pool6_mbox_loc_perm"
+  top: "pool6_mbox_loc_flat"
+  flatten_param {
+    axis: 1  # flatten starting at axis 1; this output is one input of the mbox_loc Concat
+  }
+}
+layer {
+  name: "pool6_mbox_conf"  # NOTE(review): presumably the per-prior class scores for pool6 - confirm
+  type: "Convolution"
+  bottom: "pool6"
+  top: "pool6_mbox_conf"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 28  # half of pool6_mbox_loc's 56 outputs
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+    pad_h: 0
+    pad_w: 2  # same 1x5 kernel geometry as the loc branch
+    kernel_h: 1
+    kernel_w: 5
+    stride_h: 1
+    stride_w: 1
+  }
+}
+layer {
+  name: "pool6_mbox_conf_perm"
+  type: "Permute"
+  bottom: "pool6_mbox_conf"
+  top: "pool6_mbox_conf_perm"
+  permute_param {
+    order: 0  # same 0,2,3,1 reordering as the loc branch
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "pool6_mbox_conf_flat"
+  type: "Flatten"
+  bottom: "pool6_mbox_conf_perm"
+  top: "pool6_mbox_conf_flat"
+  flatten_param {
+    axis: 1  # this output is one input of the mbox_conf Concat
+  }
+}
+layer {
+  name: "pool6_mbox_priorbox"
+  type: "PriorBox"
+  bottom: "pool6"  # NOTE(review): presumably the feature map the priors are laid out on - confirm
+  bottom: "data"  # NOTE(review): presumably the network input, used for the image size - confirm
+  top: "pool6_mbox_priorbox"
+  prior_box_param {
+    min_size: 276.0  # equals max_size of conv8_2_mbox_priorbox
+    max_size: 330.0
+    aspect_ratio: 2
+    aspect_ratio: 3
+    aspect_ratio: 5
+    aspect_ratio: 7
+    aspect_ratio: 10
+    flip: false
+    clip: true
+    variance: 0.1
+    variance: 0.1
+    variance: 0.2
+    variance: 0.2
+    additional_y_offset: true  # NOTE(review): looks like a TextBoxes-specific PriorBox option - confirm the parser supports it
+  }
+}
+layer {
+  name: "mbox_loc"  # joins the flattened loc outputs of all six sources into one blob
+  type: "Concat"
+  bottom: "conv4_3_norm_mbox_loc_flat"
+  bottom: "fc7_mbox_loc_flat"
+  bottom: "conv6_2_mbox_loc_flat"
+  bottom: "conv7_2_mbox_loc_flat"
+  bottom: "conv8_2_mbox_loc_flat"
+  bottom: "pool6_mbox_loc_flat"
+  top: "mbox_loc"
+  concat_param {
+    axis: 1  # same source order and axis as the mbox_conf Concat
+  }
+}
+layer {
+  name: "mbox_conf"  # joins the flattened conf outputs of all six sources into one blob
+  type: "Concat"
+  bottom: "conv4_3_norm_mbox_conf_flat"
+  bottom: "fc7_mbox_conf_flat"
+  bottom: "conv6_2_mbox_conf_flat"
+  bottom: "conv7_2_mbox_conf_flat"
+  bottom: "conv8_2_mbox_conf_flat"
+  bottom: "pool6_mbox_conf_flat"
+  top: "mbox_conf"
+  concat_param {
+    axis: 1  # same source order and axis as the mbox_loc Concat
+  }
+}
+layer {
+  name: "mbox_priorbox"  # joins the PriorBox outputs of all six sources, in the same source order as mbox_loc/mbox_conf
+  type: "Concat"
+  bottom: "conv4_3_norm_mbox_priorbox"
+  bottom: "fc7_mbox_priorbox"
+  bottom: "conv6_2_mbox_priorbox"
+  bottom: "conv7_2_mbox_priorbox"
+  bottom: "conv8_2_mbox_priorbox"
+  bottom: "pool6_mbox_priorbox"
+  top: "mbox_priorbox"
+  concat_param {
+    axis: 2  # priors are joined along axis 2, unlike loc/conf which use axis 1
+  }
+}
+layer {
+  name: "mbox_conf_reshape"  # regroups the concatenated conf values into groups of 2 before the softmax
+  type: "Reshape"
+  bottom: "mbox_conf"
+  top: "mbox_conf_reshape"
+  reshape_param {
+    shape {
+      dim: 0  # NOTE(review): in Caffe Reshape, 0 presumably keeps the input dim and -1 is inferred - confirm
+      dim: -1
+      dim: 2  # same value as num_classes in detection_out
+    }
+  }
+}
+layer {
+  name: "mbox_conf_softmax"  # turns the reshaped conf values into normalized scores
+  type: "Softmax"
+  bottom: "mbox_conf_reshape"
+  top: "mbox_conf_softmax"
+  softmax_param {
+    axis: 2  # the size-2 axis produced by mbox_conf_reshape
+  }
+}
+layer {
+  name: "mbox_conf_flatten"  # flattens the softmax output again; its top is the second input of detection_out
+  type: "Flatten"
+  bottom: "mbox_conf_softmax"
+  top: "mbox_conf_flatten"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "detection_out"  # final layer: combines box offsets, scores and priors into detections
+  type: "DetectionOutput"
+  bottom: "mbox_loc"  # concatenated loc outputs
+  bottom: "mbox_conf_flatten"  # softmax scores, flattened
+  bottom: "mbox_priorbox"  # concatenated PriorBox outputs
+  top: "detection_out"
+  include {
+    phase: TEST
+  }
+  detection_output_param {
+    num_classes: 2  # same value as the last dim of mbox_conf_reshape
+    share_location: true
+    background_label_id: 0
+    nms_param {
+      nms_threshold: 0.45
+      top_k: 400
+    }
+    code_type: CENTER_SIZE
+    keep_top_k: 200
+    confidence_threshold: 0.01  # low cut-off; textbox_demo.cpp filters again with a 0.3 score threshold
+  }
+}
diff --git a/modules/text/samples/textbox_demo.cpp b/modules/text/samples/textbox_demo.cpp
new file mode 100644
index 000000000..1cf9a9aab
--- /dev/null
+++ b/modules/text/samples/textbox_demo.cpp
@@ -0,0 +1,96 @@
+#include <opencv2/text.hpp>
+#include <opencv2/highgui.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/dnn.hpp>
+
+#include  <sstream>
+#include  <iostream>
+#include  <fstream>
+
+using namespace cv;
+
+namespace
+{
+// Builds the banner and usage text printed when the demo cannot start; progFname is the program name (argv[0]).
+std::string getHelpStr(const std::string& progFname)
+{
+    std::stringstream out;
+    out << "    Demo of text detection CNN for text detection." << std::endl
+        << "    Minghui Liao, Baoguang Shi, Xiang Bai, Xinggang Wang, Wenyu Liu: TextBoxes: A Fast Text Detector with a Single Deep Neural Network, AAAI2017\n\n"
+        << "    Usage: " << progFname << " <input_image>" << std::endl  // main() reads only argv[1], as the input image
+        << "    Caffe Model files  (textbox.prototxt, TextBoxes_icdar13.caffemodel)"<<std::endl << "      must be in the current directory. See the documentation of text::TextDetectorCNN class to get download links." << std::endl;
+    return out.str();
+}
+
+// Reports whether `filename` can be opened for reading right now.
+bool fileExists (const std::string& filename)
+{
+    return std::ifstream(filename.c_str()).good();
+}
+
+// Draws on src the boxes selected by `indexes`; each entry of `indexes` is a position into groups/probs.
+void textbox_draw(Mat src, std::vector<Rect>& groups, std::vector<float>& probs, std::vector<int>& indexes)
+{
+    for (size_t i = 0; i < indexes.size(); i++)
+    {
+        const Rect& currentBox = groups[indexes[i]];
+        if (src.type() == CV_8UC3)
+        {
+            rectangle(src, currentBox, Scalar( 0, 255, 255 ), 2, LINE_AA);
+            String label = format("%.2f", probs[indexes[i]]);
+            std::cout << "text box: " << currentBox << " confidence: " << probs[indexes[i]] << "\n";
+            // Keep the label baseline at least one label-height below the top edge so the text stays visible.
+            int baseLine = 0;
+            Size labelSize = getTextSize(label, FONT_HERSHEY_PLAIN, 1, 1, &baseLine);
+            int yLeftBottom = std::max(currentBox.y, labelSize.height);
+            rectangle(src, Point(currentBox.x, yLeftBottom - labelSize.height),
+                      Point(currentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED);
+            putText(src, label, Point(currentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA);
+        }
+        else
+            rectangle(src, currentBox, Scalar( 255 ), 3, 8 ); // selected box, not groups[i]: i counts entries of `indexes`
+    }
+}
+
+}
+
+int main(int argc, const char * argv[])
+{
+    if (argc < 2)  // the image path (argv[1]) is the only argument the demo reads
+    {
+        std::cout << getHelpStr(argv[0]);
+        std::cout << "Insufiecient parameters. Aborting!" << std::endl;
+        return 1;
+    }
+
+    const std::string modelArch = "textbox.prototxt";
+    const std::string moddelWeights = "TextBoxes_icdar13.caffemodel";
+    if (!fileExists(modelArch) || !fileExists(moddelWeights))
+    {
+        std::cout << getHelpStr(argv[0]);
+        std::cout << "Model files not found in the current directory. Aborting!" << std::endl;
+        return 1;
+    }
+
+    Mat image = imread(String(argv[1]), IMREAD_COLOR);
+    if (image.empty())  // imread yields an empty Mat when the file is missing or cannot be decoded
+    {
+        std::cout << "Could not read image " << argv[1] << ". Aborting!" << std::endl;
+        return 1;
+    }
+
+    std::cout << "Starting Text Box Demo" << std::endl;
+    Ptr<text::TextDetectorCNN> textSpotter = text::TextDetectorCNN::create(modelArch, moddelWeights);
+    std::vector<Rect> bbox;
+    std::vector<float> outProbabillities;
+    textSpotter->detect(image, bbox, outProbabillities);
+    std::vector<int> indexes;
+    cv::dnn::NMSBoxes(bbox, outProbabillities, 0.3f, 0.4f, indexes);  // score threshold 0.3, NMS threshold 0.4
+    textbox_draw(image, bbox, outProbabillities, indexes);
+
+    imshow("TextBox Demo",image);
+    std::cout << "Done!" << std::endl << std::endl;
+    std::cout << "Press any key to exit." << std::endl << std::endl;
+    waitKey();
+    return 0;
+}
diff --git a/modules/text/src/ocr_holistic.cpp b/modules/text/src/ocr_holistic.cpp
index 77016edf2..07c4aa98f 100644
--- a/modules/text/src/ocr_holistic.cpp
+++ b/modules/text/src/ocr_holistic.cpp
@@ -1,3 +1,7 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
 #include "precomp.hpp"
 #include "opencv2/imgproc.hpp"
 #include "opencv2/core.hpp"
diff --git a/modules/text/src/text_detectorCNN.cpp b/modules/text/src/text_detectorCNN.cpp
new file mode 100644
index 000000000..84f769b42
--- /dev/null
+++ b/modules/text/src/text_detectorCNN.cpp
@@ -0,0 +1,94 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "precomp.hpp"
+#include "opencv2/imgproc.hpp"
+#include "opencv2/core.hpp"
+#include "opencv2/dnn.hpp"
+
+#include <fstream>
+#include <algorithm>
+
+using namespace cv::dnn;
+
+namespace cv
+{
+namespace text
+{
+
+class TextDetectorCNNImpl : public TextDetectorCNN
+{
+protected:
+    Net net_;                  // network built from the Caffe architecture/weights files
+    std::vector<Size> sizes_;  // input geometries; detect() runs one forward pass per entry
+    int inputChannelCount_;    // channel count detect() requires of its input image
+
+    // Appends one Rect and one confidence per row of `buffer` (nbrTextBoxes rows of nCol floats). As read below,
+    // column 2 is the confidence and columns 3..6 are x_min, y_min, x_max, y_max as fractions of inputShape.
+    void getOutputs(const float* buffer,int nbrTextBoxes,int nCol,
+                               std::vector<Rect>& Bbox, std::vector<float>& confidence, Size inputShape)
+    {
+        for(int k = 0; k < nbrTextBoxes; k++)
+        {
+            const float* row = buffer + k*nCol;
+            float x_min = row[3]*inputShape.width;
+            float y_min = row[4]*inputShape.height;
+            float x_max = row[5]*inputShape.width;
+            float y_max = row[6]*inputShape.height;
+            // One combined expression, so this also builds where CV_Assert accepts only a single argument.
+            CV_Assert(x_min < x_max && y_min < y_max);
+            x_min = std::max(0.f, x_min);
+            y_min = std::max(0.f, y_min);
+            x_max = std::min(inputShape.width - 1.f,  x_max);
+            y_max = std::min(inputShape.height - 1.f,  y_max);
+            // A box lying entirely outside the image ends up with max < min after clamping; never emit a negative size.
+            int wd = std::max(0, cvRound(x_max - x_min));
+            int ht = std::max(0, cvRound(y_max - y_min));
+            Bbox.push_back(Rect(cvRound(x_min), cvRound(y_min), wd, ht));
+            confidence.push_back(row[2]);
+        }
+    }
+
+public:
+    // Loads the Caffe network (asserting it is non-empty) and stores the geometries detect() will run it at.
+    TextDetectorCNNImpl(const String& modelArchFilename, const String& modelWeightsFilename, std::vector<Size> detectionSizes) :
+        sizes_(detectionSizes)
+    {
+        net_ = readNetFromCaffe(modelArchFilename, modelWeightsFilename);
+        CV_Assert(!net_.empty());
+        inputChannelCount_ = 3;
+    }
+
+    // Clears Bbox/confidence, then appends the detections obtained at every configured input geometry.
+    void detect(InputArray inputImage_, std::vector<Rect>& Bbox, std::vector<float>& confidence)
+    {
+        CV_Assert(inputImage_.channels() == inputChannelCount_);
+        Mat inputImage = inputImage_.getMat();
+        Bbox.resize(0);
+        confidence.resize(0);
+        for(size_t i = 0; i < sizes_.size(); i++)
+        {
+            Size inputGeometry = sizes_[i];
+            net_.setInput(blobFromImage(inputImage, 1, inputGeometry, Scalar(123, 117, 104), false, false), "data");
+            Mat outputNet = net_.forward();
+            // getOutputs() reads columns 2..6 of every row, so require a single-channel 4-D blob with at least 7 columns.
+            CV_Assert(outputNet.dims == 4 && outputNet.size[1] == 1 && outputNet.size[3] >= 7);
+            int nbrTextBoxes = outputNet.size[2];
+            int nCol = outputNet.size[3];
+            getOutputs(reinterpret_cast<const float*>(outputNet.data), nbrTextBoxes, nCol, Bbox, confidence, inputImage.size());
+        }
+    }
+};
+
+Ptr<TextDetectorCNN> TextDetectorCNN::create(const String &modelArchFilename, const String &modelWeightsFilename, std::vector<Size> detectionSizes)
+{
+    return makePtr<TextDetectorCNNImpl>(modelArchFilename, modelWeightsFilename, detectionSizes); // factory: builds the Impl and returns it through the TextDetectorCNN interface
+}
+
+Ptr<TextDetectorCNN> TextDetectorCNN::create(const String &modelArchFilename, const String &modelWeightsFilename)
+{
+    return create(modelArchFilename, modelWeightsFilename, std::vector<Size>(1, Size(300, 300))); // convenience overload: a single 300x300 detection size
+}
+} //namespace text
+} //namespace cv