diff --git a/modules/text/README.md b/modules/text/README.md index 488518a28..b6955fd98 100644 --- a/modules/text/README.md +++ b/modules/text/README.md @@ -47,3 +47,12 @@ Notes 2. Tesseract configure script may fail to detect leptonica, so you may have to edit the configure script - comment off some if's around this message and retain only "then" branch. 3. You are encouraged to search the Net for some better pre-trained classifiers, as well as classifiers for other languages. + + +Text Detection CNN +================= + +Intro +----- + +The text module now has text detection and recognition based on deep CNNs. The text detector is a deep CNN that takes an image which may contain multiple words and outputs a list of Rects with bounding boxes, each with the probability that it contains text. The text recognizer provides a probability over a given vocabulary for each of these rects. diff --git a/modules/text/cmake/FindTesseract.cmake b/modules/text/cmake/FindTesseract.cmake index 2a5d868f9..5bdbe2436 100644 --- a/modules/text/cmake/FindTesseract.cmake +++ b/modules/text/cmake/FindTesseract.cmake @@ -5,14 +5,17 @@ endif() if(NOT Tesseract_FOUND) find_path(Tesseract_INCLUDE_DIR tesseract/baseapi.h HINTS + /usr/include /usr/local/include) find_library(Tesseract_LIBRARY NAMES tesseract HINTS + /usr/lib /usr/local/lib) find_library(Lept_LIBRARY NAMES lept HINTS + /usr/lib /usr/local/lib) if(Tesseract_INCLUDE_DIR AND Tesseract_LIBRARY AND Lept_LIBRARY) diff --git a/modules/text/doc/text.bib b/modules/text/doc/text.bib index 64a8f4a19..d2ed9f9b6 100644 --- a/modules/text/doc/text.bib +++ b/modules/text/doc/text.bib @@ -31,4 +31,14 @@ journal = {CoRR}, volume = {abs/1407.7504}, year = {2014}, -} \ No newline at end of file +} +@inproceedings{LiaoSBWL17, + author = {Minghui Liao and + Baoguang Shi and + Xiang Bai and + Xinggang Wang and + Wenyu Liu}, + title = {TextBoxes: {A} Fast Text Detector with a Single Deep Neural Network}, + booktitle = {AAAI}, + year = {2017} +} diff --git 
a/modules/text/include/opencv2/text.hpp b/modules/text/include/opencv2/text.hpp index cea49c69c..c06c88983 100644 --- a/modules/text/include/opencv2/text.hpp +++ b/modules/text/include/opencv2/text.hpp @@ -41,6 +41,7 @@ the use of this software, even if advised of the possibility of such damage. #include "opencv2/text/erfilter.hpp" #include "opencv2/text/ocr.hpp" +#include "opencv2/text/textDetector.hpp" /** @defgroup text Scene Text Detection and Recognition diff --git a/modules/text/include/opencv2/text/ocr.hpp b/modules/text/include/opencv2/text/ocr.hpp index 645afeaef..c8e0129be 100644 --- a/modules/text/include/opencv2/text/ocr.hpp +++ b/modules/text/include/opencv2/text/ocr.hpp @@ -44,6 +44,8 @@ #ifndef __OPENCV_TEXT_OCR_HPP__ #define __OPENCV_TEXT_OCR_HPP__ +#include + #include #include diff --git a/modules/text/include/opencv2/text/textDetector.hpp b/modules/text/include/opencv2/text/textDetector.hpp new file mode 100644 index 000000000..fdb92fdfb --- /dev/null +++ b/modules/text/include/opencv2/text/textDetector.hpp @@ -0,0 +1,73 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef __OPENCV_TEXT_TEXTDETECTOR_HPP__ +#define __OPENCV_TEXT_TEXTDETECTOR_HPP__ + +#include"ocr.hpp" + +namespace cv +{ +namespace text +{ + +//! @addtogroup text_detect +//! 
@{ + +/** @brief An abstract class providing an interface for text detection algorithms + */ +class CV_EXPORTS_W TextDetector +{ +public: + /** + @brief Method that provides a quick and simple interface to detect text inside an image + + @param inputImage an image to process + @param Bbox a vector of Rect that will store the detected word bounding boxes + @param confidence a vector of float that will be updated with the confidence the classifier has for each bounding box + */ + CV_WRAP virtual void detect(InputArray inputImage, CV_OUT std::vector& Bbox, CV_OUT std::vector& confidence) = 0; + virtual ~TextDetector() {} +}; + +/** @brief TextDetectorCNN class provides the functionality of text bounding box detection. + This class finds bounding boxes of text words given an input image. + This class uses the OpenCV dnn module to load the pre-trained model described in @cite LiaoSBWL17. + The original repository with the modified SSD Caffe version: https://github.com/MhLiao/TextBoxes. + Model can be downloaded from [DropBox](https://www.dropbox.com/s/g8pjzv2de9gty8g/TextBoxes_icdar13.caffemodel?dl=0). + Modified .prototxt file with the model description can be found in `opencv_contrib/modules/text/samples/textbox.prototxt`. + */ +class CV_EXPORTS_W TextDetectorCNN : public TextDetector +{ +public: + /** + @overload + + @param inputImage an image expected to be a CV_8UC3 of any size + @param Bbox a vector of Rect that will store the detected word bounding boxes + @param confidence a vector of float that will be updated with the confidence the classifier has for each bounding box + */ + CV_WRAP virtual void detect(InputArray inputImage, CV_OUT std::vector& Bbox, CV_OUT std::vector& confidence) = 0; + + /** @brief Creates an instance of the TextDetectorCNN class using the provided parameters. + + @param modelArchFilename the relative or absolute path to the prototxt file describing the classifiers architecture. 
+ @param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form. + @param detectionSizes a list of sizes for multiscale detection. The values `[(300,300),(700,500),(700,300),(700,700),(1600,1600)]` are + recommended in @cite LiaoSBWL17 to achieve the best quality. + */ + static Ptr create(const String& modelArchFilename, const String& modelWeightsFilename, + std::vector detectionSizes); + /** + @overload + */ + CV_WRAP static Ptr create(const String& modelArchFilename, const String& modelWeightsFilename); +}; + +//! @} +}//namespace text +}//namespace cv + + +#endif // __OPENCV_TEXT_TEXTDETECTOR_HPP__ diff --git a/modules/text/samples/deeptextdetection.py b/modules/text/samples/deeptextdetection.py new file mode 100644 index 000000000..256a28e9e --- /dev/null +++ b/modules/text/samples/deeptextdetection.py @@ -0,0 +1,37 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +import sys +import os +import cv2 +import numpy as np + +def main(): + print('\nDeeptextdetection.py') + print(' A demo script of text box alogorithm of the paper:') + print(' * Minghui Liao et al.: TextBoxes: A Fast Text Detector with a Single Deep Neural Network https://arxiv.org/abs/1611.06779\n') + + if (len(sys.argv) < 2): + print(' (ERROR) You must call this script with an argument (path_to_image_to_be_processed)\n') + quit() + + if not os.path.isfile('TextBoxes_icdar13.caffemodel') or not os.path.isfile('textbox.prototxt'): + print(" Model files not found in current directory. Aborting") + print(" See the documentation of text::TextDetectorCNN class to get download links.") 
+ quit() + + img = cv2.imread(str(sys.argv[1])) + textSpotter = cv2.text.TextDetectorCNN_create("textbox.prototxt", "TextBoxes_icdar13.caffemodel") + rects, outProbs = textSpotter.detect(img); + vis = img.copy() + thres = 0.6 + + for r in range(np.shape(rects)[0]): + if outProbs[r] > thres: + rect = rects[r] + cv2.rectangle(vis, (rect[0],rect[1]), (rect[0] + rect[2], rect[1] + rect[3]), (255, 0, 0), 2) + + cv2.imshow("Text detection result", vis) + cv2.waitKey() + +if __name__ == "__main__": + main() diff --git a/modules/text/samples/dictnet_demo.cpp b/modules/text/samples/dictnet_demo.cpp index 277a1c9be..f70f2c175 100644 --- a/modules/text/samples/dictnet_demo.cpp +++ b/modules/text/samples/dictnet_demo.cpp @@ -1,12 +1,3 @@ -/* - * dictnet_demo.cpp - * - * Demonstrates simple use of the holistic word classifier in C++ - * - * Created on: June 26, 2016 - * Author: Anguelos Nicolaou - */ - #include "opencv2/text.hpp" #include "opencv2/highgui.hpp" #include "opencv2/imgproc.hpp" diff --git a/modules/text/samples/text_recognition_cnn.cpp b/modules/text/samples/text_recognition_cnn.cpp new file mode 100644 index 000000000..84df57d29 --- /dev/null +++ b/modules/text/samples/text_recognition_cnn.cpp @@ -0,0 +1,122 @@ +#include +#include +#include +#include + +#include +#include + +using namespace cv; +using namespace std; + +namespace +{ +void printHelpStr(const string& progFname) +{ + cout << " Demo of text recognition CNN for text detection." 
<< endl + << " Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015"< " << endl + << " Caffe Model files (textbox.prototxt, TextBoxes_icdar13.caffemodel)"<& groups, std::vector& probs, std::vector& indexes) +{ + for (size_t i = 0; i < indexes.size(); i++) + { + if (src.type() == CV_8UC3) + { + Rect currrentBox = groups[indexes[i]]; + rectangle(src, currrentBox, Scalar( 0, 255, 255 ), 2, LINE_AA); + String label = format("%.2f", probs[indexes[i]]); + std::cout << "text box: " << currrentBox << " confidence: " << probs[indexes[i]] << "\n"; + + int baseLine = 0; + Size labelSize = getTextSize(label, FONT_HERSHEY_PLAIN, 1, 1, &baseLine); + int yLeftBottom = std::max(currrentBox.y, labelSize.height); + rectangle(src, Point(currrentBox.x, yLeftBottom - labelSize.height), + Point(currrentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED); + + putText(src, label, Point(currrentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA); + } + else + rectangle(src, groups[indexes[i]], Scalar( 255 ), 3, 8 ); /* i walks indexes, so look the box up through it, as the colour branch does */ + } +} + +} + +int main(int argc, const char * argv[]) +{ + if (argc < 2) + { + printHelpStr(argv[0]); + cout << "Insufiecient parameters. Aborting!" << endl; + exit(1); + } + + const string modelArch = "textbox.prototxt"; + const string moddelWeights = "TextBoxes_icdar13.caffemodel"; + + if (!fileExists(modelArch) || !fileExists(moddelWeights)) + { + printHelpStr(argv[0]); + cout << "Model files not found in the current directory. Aborting!" 
<< endl; + exit(1); + } + + Mat image = imread(String(argv[1]), IMREAD_COLOR); + + cout << "Starting Text Box Demo" << endl; + Ptr textSpotter = + text::TextDetectorCNN::create(modelArch, moddelWeights); + + vector bbox; + vector outProbabillities; + textSpotter->detect(image, bbox, outProbabillities); + std::vector indexes; + cv::dnn::NMSBoxes(bbox, outProbabillities, 0.4f, 0.5f, indexes); + + Mat image_copy = image.clone(); + textbox_draw(image_copy, bbox, outProbabillities, indexes); + imshow("Text detection", image_copy); + image_copy = image.clone(); + + Ptr wordSpotter = + text::OCRHolisticWordRecognizer::create("dictnet_vgg_deploy.prototxt", "dictnet_vgg.caffemodel", "dictnet_vgg_labels.txt"); + + for(size_t i = 0; i < indexes.size(); i++) + { + Mat wordImg; + cvtColor(image(bbox[indexes[i]]), wordImg, COLOR_BGR2GRAY); + string word; + vector confs; + wordSpotter->run(wordImg, word, NULL, NULL, &confs); + + Rect currrentBox = bbox[indexes[i]]; + rectangle(image_copy, currrentBox, Scalar( 0, 255, 255 ), 2, LINE_AA); + + int baseLine = 0; + Size labelSize = getTextSize(word, FONT_HERSHEY_PLAIN, 1, 1, &baseLine); + int yLeftBottom = std::max(currrentBox.y, labelSize.height); + rectangle(image_copy, Point(currrentBox.x, yLeftBottom - labelSize.height), + Point(currrentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED); + + putText(image_copy, word, Point(currrentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA); + + } + imshow("Text recognition", image_copy); + cout << "Recognition finished. 
Press any key to exit.\n"; + waitKey(); + return 0; +} diff --git a/modules/text/samples/textbox.prototxt b/modules/text/samples/textbox.prototxt new file mode 100644 index 000000000..bb8019828 --- /dev/null +++ b/modules/text/samples/textbox.prototxt @@ -0,0 +1,1611 @@ +name: "VGG_text_longer_conv_300x300_deploy" +input: "data" +input_shape { + dim: 1 + dim: 3 + dim: 700 + dim: 700 +} +layer { + name: "conv1_1" + type: "Convolution" + bottom: "data" + top: "conv1_1" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu1_1" + type: "ReLU" + bottom: "conv1_1" + top: "conv1_1" +} +layer { + name: "conv1_2" + type: "Convolution" + bottom: "conv1_1" + top: "conv1_2" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu1_2" + type: "ReLU" + bottom: "conv1_2" + top: "conv1_2" +} +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1_2" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv2_1" + type: "Convolution" + bottom: "pool1" + top: "conv2_1" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu2_1" + type: "ReLU" + bottom: "conv2_1" + top: "conv2_1" +} +layer { + name: "conv2_2" + type: "Convolution" + bottom: "conv2_1" + top: "conv2_2" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + 
kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu2_2" + type: "ReLU" + bottom: "conv2_2" + top: "conv2_2" +} +layer { + name: "pool2" + type: "Pooling" + bottom: "conv2_2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv3_1" + type: "Convolution" + bottom: "pool2" + top: "conv3_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu3_1" + type: "ReLU" + bottom: "conv3_1" + top: "conv3_1" +} +layer { + name: "conv3_2" + type: "Convolution" + bottom: "conv3_1" + top: "conv3_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu3_2" + type: "ReLU" + bottom: "conv3_2" + top: "conv3_2" +} +layer { + name: "conv3_3" + type: "Convolution" + bottom: "conv3_2" + top: "conv3_3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu3_3" + type: "ReLU" + bottom: "conv3_3" + top: "conv3_3" +} +layer { + name: "pool3" + type: "Pooling" + bottom: "conv3_3" + top: "pool3" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv4_1" + type: "Convolution" + bottom: "pool3" + top: "conv4_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + 
} + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu4_1" + type: "ReLU" + bottom: "conv4_1" + top: "conv4_1" +} +layer { + name: "conv4_2" + type: "Convolution" + bottom: "conv4_1" + top: "conv4_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu4_2" + type: "ReLU" + bottom: "conv4_2" + top: "conv4_2" +} +layer { + name: "conv4_3" + type: "Convolution" + bottom: "conv4_2" + top: "conv4_3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu4_3" + type: "ReLU" + bottom: "conv4_3" + top: "conv4_3" +} +layer { + name: "pool4" + type: "Pooling" + bottom: "conv4_3" + top: "pool4" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv5_1" + type: "Convolution" + bottom: "pool4" + top: "conv5_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu5_1" + type: "ReLU" + bottom: "conv5_1" + top: "conv5_1" +} +layer { + name: "conv5_2" + type: "Convolution" + bottom: "conv5_1" + top: "conv5_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu5_2" + type: "ReLU" + bottom: "conv5_2" + top: "conv5_2" +} +layer { + name: "conv5_3" + type: "Convolution" + 
bottom: "conv5_2" + top: "conv5_3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu5_3" + type: "ReLU" + bottom: "conv5_3" + top: "conv5_3" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5_3" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "fc6" + type: "Convolution" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 1024 + pad: 6 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + dilation: 6 + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "fc7" + type: "Convolution" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 1024 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "conv6_1" + type: "Convolution" + bottom: "fc7" + top: "conv6_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6_1_relu" + type: "ReLU" + bottom: "conv6_1" + top: "conv6_1" +} +layer { + name: "conv6_2" + type: "Convolution" + bottom: "conv6_1" + top: "conv6_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + stride: 2 + 
weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6_2_relu" + type: "ReLU" + bottom: "conv6_2" + top: "conv6_2" +} +layer { + name: "conv7_1" + type: "Convolution" + bottom: "conv6_2" + top: "conv7_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv7_1_relu" + type: "ReLU" + bottom: "conv7_1" + top: "conv7_1" +} +layer { + name: "conv7_2" + type: "Convolution" + bottom: "conv7_1" + top: "conv7_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv7_2_relu" + type: "ReLU" + bottom: "conv7_2" + top: "conv7_2" +} +layer { + name: "conv8_1" + type: "Convolution" + bottom: "conv7_2" + top: "conv8_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv8_1_relu" + type: "ReLU" + bottom: "conv8_1" + top: "conv8_1" +} +layer { + name: "conv8_2" + type: "Convolution" + bottom: "conv8_1" + top: "conv8_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv8_2_relu" + type: "ReLU" + bottom: "conv8_2" + top: "conv8_2" +} +layer { + name: "pool6" + type: "Pooling" + bottom: "conv8_2" + top: "pool6" + 
pooling_param { + pool: AVE + global_pooling: true + } +} +layer { + name: "conv4_3_norm" + type: "Normalize" + bottom: "conv4_3" + top: "conv4_3_norm" + norm_param { + across_spatial: false + scale_filler { + type: "constant" + value: 20 + } + channel_shared: false + } +} +layer { + name: "conv4_3_norm_mbox_loc" + type: "Convolution" + bottom: "conv4_3_norm" + top: "conv4_3_norm_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 48 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + pad_h: 0 + pad_w: 2 + kernel_h: 1 + kernel_w: 5 + stride_h: 1 + stride_w: 1 + } +} +layer { + name: "conv4_3_norm_mbox_loc_perm" + type: "Permute" + bottom: "conv4_3_norm_mbox_loc" + top: "conv4_3_norm_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv4_3_norm_mbox_loc_flat" + type: "Flatten" + bottom: "conv4_3_norm_mbox_loc_perm" + top: "conv4_3_norm_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv4_3_norm_mbox_conf" + type: "Convolution" + bottom: "conv4_3_norm" + top: "conv4_3_norm_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 24 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + pad_h: 0 + pad_w: 2 + kernel_h: 1 + kernel_w: 5 + stride_h: 1 + stride_w: 1 + } +} +layer { + name: "conv4_3_norm_mbox_conf_perm" + type: "Permute" + bottom: "conv4_3_norm_mbox_conf" + top: "conv4_3_norm_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv4_3_norm_mbox_conf_flat" + type: "Flatten" + bottom: "conv4_3_norm_mbox_conf_perm" + top: "conv4_3_norm_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv4_3_norm_mbox_priorbox" + type: "PriorBox" + bottom: "conv4_3_norm" + bottom: "data" + top: 
"conv4_3_norm_mbox_priorbox" + prior_box_param { + min_size: 30.0 + aspect_ratio: 2 + aspect_ratio: 3 + aspect_ratio: 5 + aspect_ratio: 7 + aspect_ratio: 10 + flip: false + clip: true + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + additional_y_offset: true + } +} +layer { + name: "fc7_mbox_loc" + type: "Convolution" + bottom: "fc7" + top: "fc7_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 56 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + pad_h: 0 + pad_w: 2 + kernel_h: 1 + kernel_w: 5 + stride_h: 1 + stride_w: 1 + } +} +layer { + name: "fc7_mbox_loc_perm" + type: "Permute" + bottom: "fc7_mbox_loc" + top: "fc7_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "fc7_mbox_loc_flat" + type: "Flatten" + bottom: "fc7_mbox_loc_perm" + top: "fc7_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "fc7_mbox_conf" + type: "Convolution" + bottom: "fc7" + top: "fc7_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 28 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + pad_h: 0 + pad_w: 2 + kernel_h: 1 + kernel_w: 5 + stride_h: 1 + stride_w: 1 + } +} +layer { + name: "fc7_mbox_conf_perm" + type: "Permute" + bottom: "fc7_mbox_conf" + top: "fc7_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "fc7_mbox_conf_flat" + type: "Flatten" + bottom: "fc7_mbox_conf_perm" + top: "fc7_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "fc7_mbox_priorbox" + type: "PriorBox" + bottom: "fc7" + bottom: "data" + top: "fc7_mbox_priorbox" + prior_box_param { + min_size: 60.0 + max_size: 114.0 + aspect_ratio: 2 + aspect_ratio: 3 + aspect_ratio: 5 + aspect_ratio: 7 + aspect_ratio: 10 + flip: false + 
clip: true + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + additional_y_offset: true + } +} +layer { + name: "conv6_2_mbox_loc" + type: "Convolution" + bottom: "conv6_2" + top: "conv6_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 56 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + pad_h: 0 + pad_w: 2 + kernel_h: 1 + kernel_w: 5 + stride_h: 1 + stride_w: 1 + } +} +layer { + name: "conv6_2_mbox_loc_perm" + type: "Permute" + bottom: "conv6_2_mbox_loc" + top: "conv6_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv6_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv6_2_mbox_loc_perm" + top: "conv6_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv6_2_mbox_conf" + type: "Convolution" + bottom: "conv6_2" + top: "conv6_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 28 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + pad_h: 0 + pad_w: 2 + kernel_h: 1 + kernel_w: 5 + stride_h: 1 + stride_w: 1 + } +} +layer { + name: "conv6_2_mbox_conf_perm" + type: "Permute" + bottom: "conv6_2_mbox_conf" + top: "conv6_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv6_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv6_2_mbox_conf_perm" + top: "conv6_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv6_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv6_2" + bottom: "data" + top: "conv6_2_mbox_priorbox" + prior_box_param { + min_size: 114.0 + max_size: 168.0 + aspect_ratio: 2 + aspect_ratio: 3 + aspect_ratio: 5 + aspect_ratio: 7 + aspect_ratio: 10 + flip: false + clip: true + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + 
additional_y_offset: true + } +} +layer { + name: "conv7_2_mbox_loc" + type: "Convolution" + bottom: "conv7_2" + top: "conv7_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 56 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + pad_h: 0 + pad_w: 2 + kernel_h: 1 + kernel_w: 5 + stride_h: 1 + stride_w: 1 + } +} +layer { + name: "conv7_2_mbox_loc_perm" + type: "Permute" + bottom: "conv7_2_mbox_loc" + top: "conv7_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv7_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv7_2_mbox_loc_perm" + top: "conv7_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv7_2_mbox_conf" + type: "Convolution" + bottom: "conv7_2" + top: "conv7_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 28 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + pad_h: 0 + pad_w: 2 + kernel_h: 1 + kernel_w: 5 + stride_h: 1 + stride_w: 1 + } +} +layer { + name: "conv7_2_mbox_conf_perm" + type: "Permute" + bottom: "conv7_2_mbox_conf" + top: "conv7_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv7_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv7_2_mbox_conf_perm" + top: "conv7_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv7_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv7_2" + bottom: "data" + top: "conv7_2_mbox_priorbox" + prior_box_param { + min_size: 168.0 + max_size: 222.0 + aspect_ratio: 2 + aspect_ratio: 3 + aspect_ratio: 5 + aspect_ratio: 7 + aspect_ratio: 10 + flip: false + clip: true + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + additional_y_offset: true + } +} +layer { + name: "conv8_2_mbox_loc" + type: "Convolution" 
+ bottom: "conv8_2" + top: "conv8_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 56 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + pad_h: 0 + pad_w: 2 + kernel_h: 1 + kernel_w: 5 + stride_h: 1 + stride_w: 1 + } +} +layer { + name: "conv8_2_mbox_loc_perm" + type: "Permute" + bottom: "conv8_2_mbox_loc" + top: "conv8_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv8_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv8_2_mbox_loc_perm" + top: "conv8_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv8_2_mbox_conf" + type: "Convolution" + bottom: "conv8_2" + top: "conv8_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 28 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + pad_h: 0 + pad_w: 2 + kernel_h: 1 + kernel_w: 5 + stride_h: 1 + stride_w: 1 + } +} +layer { + name: "conv8_2_mbox_conf_perm" + type: "Permute" + bottom: "conv8_2_mbox_conf" + top: "conv8_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv8_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv8_2_mbox_conf_perm" + top: "conv8_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv8_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv8_2" + bottom: "data" + top: "conv8_2_mbox_priorbox" + prior_box_param { + min_size: 222.0 + max_size: 276.0 + aspect_ratio: 2 + aspect_ratio: 3 + aspect_ratio: 5 + aspect_ratio: 7 + aspect_ratio: 10 + flip: false + clip: true + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + additional_y_offset: true + } +} +layer { + name: "pool6_mbox_loc" + type: "Convolution" + bottom: "pool6" + top: "pool6_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param 
{ + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 56 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + pad_h: 0 + pad_w: 2 + kernel_h: 1 + kernel_w: 5 + stride_h: 1 + stride_w: 1 + } +} +layer { + name: "pool6_mbox_loc_perm" + type: "Permute" + bottom: "pool6_mbox_loc" + top: "pool6_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "pool6_mbox_loc_flat" + type: "Flatten" + bottom: "pool6_mbox_loc_perm" + top: "pool6_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "pool6_mbox_conf" + type: "Convolution" + bottom: "pool6" + top: "pool6_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 28 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + pad_h: 0 + pad_w: 2 + kernel_h: 1 + kernel_w: 5 + stride_h: 1 + stride_w: 1 + } +} +layer { + name: "pool6_mbox_conf_perm" + type: "Permute" + bottom: "pool6_mbox_conf" + top: "pool6_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "pool6_mbox_conf_flat" + type: "Flatten" + bottom: "pool6_mbox_conf_perm" + top: "pool6_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "pool6_mbox_priorbox" + type: "PriorBox" + bottom: "pool6" + bottom: "data" + top: "pool6_mbox_priorbox" + prior_box_param { + min_size: 276.0 + max_size: 330.0 + aspect_ratio: 2 + aspect_ratio: 3 + aspect_ratio: 5 + aspect_ratio: 7 + aspect_ratio: 10 + flip: false + clip: true + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + additional_y_offset: true + } +} +layer { + name: "mbox_loc" + type: "Concat" + bottom: "conv4_3_norm_mbox_loc_flat" + bottom: "fc7_mbox_loc_flat" + bottom: "conv6_2_mbox_loc_flat" + bottom: "conv7_2_mbox_loc_flat" + bottom: "conv8_2_mbox_loc_flat" + bottom: "pool6_mbox_loc_flat" + top: "mbox_loc" + concat_param 
{ + axis: 1 + } +} +layer { + name: "mbox_conf" + type: "Concat" + bottom: "conv4_3_norm_mbox_conf_flat" + bottom: "fc7_mbox_conf_flat" + bottom: "conv6_2_mbox_conf_flat" + bottom: "conv7_2_mbox_conf_flat" + bottom: "conv8_2_mbox_conf_flat" + bottom: "pool6_mbox_conf_flat" + top: "mbox_conf" + concat_param { + axis: 1 + } +} +layer { + name: "mbox_priorbox" + type: "Concat" + bottom: "conv4_3_norm_mbox_priorbox" + bottom: "fc7_mbox_priorbox" + bottom: "conv6_2_mbox_priorbox" + bottom: "conv7_2_mbox_priorbox" + bottom: "conv8_2_mbox_priorbox" + bottom: "pool6_mbox_priorbox" + top: "mbox_priorbox" + concat_param { + axis: 2 + } +} +layer { + name: "mbox_conf_reshape" + type: "Reshape" + bottom: "mbox_conf" + top: "mbox_conf_reshape" + reshape_param { + shape { + dim: 0 + dim: -1 + dim: 2 + } + } +} +layer { + name: "mbox_conf_softmax" + type: "Softmax" + bottom: "mbox_conf_reshape" + top: "mbox_conf_softmax" + softmax_param { + axis: 2 + } +} +layer { + name: "mbox_conf_flatten" + type: "Flatten" + bottom: "mbox_conf_softmax" + top: "mbox_conf_flatten" + flatten_param { + axis: 1 + } +} +layer { + name: "detection_out" + type: "DetectionOutput" + bottom: "mbox_loc" + bottom: "mbox_conf_flatten" + bottom: "mbox_priorbox" + top: "detection_out" + include { + phase: TEST + } + detection_output_param { + num_classes: 2 + share_location: true + background_label_id: 0 + nms_param { + nms_threshold: 0.45 + top_k: 400 + } + code_type: CENTER_SIZE + keep_top_k: 200 + confidence_threshold: 0.01 + } +} diff --git a/modules/text/samples/textbox_demo.cpp b/modules/text/samples/textbox_demo.cpp new file mode 100644 index 000000000..1cf9a9aab --- /dev/null +++ b/modules/text/samples/textbox_demo.cpp @@ -0,0 +1,96 @@ +#include +#include +#include +#include + +#include +#include +#include + +using namespace cv; + +namespace +{ +std::string getHelpStr(const std::string& progFname) +{ + std::stringstream out; + out << " Demo of text detection CNN for text detection." 
<< std::endl + << " Minghui Liao, Baoguang Shi, Xiang Bai, Xinggang Wang, Wenyu Liu: TextBoxes: A Fast Text Detector with a Single Deep Neural Network, AAAI2017\n\n" + << " Usage: " << progFname << " " << std::endl + << " Caffe Model files (textbox.prototxt, TextBoxes_icdar13.caffemodel)"<& groups, std::vector& probs, std::vector& indexes) +{ + for (size_t i = 0; i < indexes.size(); i++) + { + if (src.type() == CV_8UC3) + { + Rect currrentBox = groups[indexes[i]]; + rectangle(src, currrentBox, Scalar( 0, 255, 255 ), 2, LINE_AA); + String label = format("%.2f", probs[indexes[i]]); + std::cout << "text box: " << currrentBox << " confidence: " << probs[indexes[i]] << "\n"; + + int baseLine = 0; + Size labelSize = getTextSize(label, FONT_HERSHEY_PLAIN, 1, 1, &baseLine); + int yLeftBottom = std::max(currrentBox.y, labelSize.height); + rectangle(src, Point(currrentBox.x, yLeftBottom - labelSize.height), + Point(currrentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED); + + putText(src, label, Point(currrentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA); + } + else + rectangle(src, groups[i], Scalar( 255 ), 3, 8 ); + } +} + +} + +int main(int argc, const char * argv[]) +{ + if (argc < 2) + { + std::cout << getHelpStr(argv[0]); + std::cout << "Insufiecient parameters. Aborting!" << std::endl; + exit(1); + } + + const std::string modelArch = "textbox.prototxt"; + const std::string moddelWeights = "TextBoxes_icdar13.caffemodel"; + + if (!fileExists(modelArch) || !fileExists(moddelWeights)) + { + std::cout << getHelpStr(argv[0]); + std::cout << "Model files not found in the current directory. Aborting!" 
<< std::endl; + exit(1); + } + + Mat image = imread(String(argv[1]), IMREAD_COLOR); + + std::cout << "Starting Text Box Demo" << std::endl; + Ptr textSpotter = + text::TextDetectorCNN::create(modelArch, moddelWeights); + + std::vector bbox; + std::vector outProbabillities; + textSpotter->detect(image, bbox, outProbabillities); + + std::vector indexes; + cv::dnn::NMSBoxes(bbox, outProbabillities, 0.3f, 0.4f, indexes); + + textbox_draw(image, bbox, outProbabillities, indexes); + + imshow("TextBox Demo",image); + std::cout << "Done!" << std::endl << std::endl; + std::cout << "Press any key to exit." << std::endl << std::endl; + waitKey(); + return 0; +} diff --git a/modules/text/src/ocr_holistic.cpp b/modules/text/src/ocr_holistic.cpp index 77016edf2..07c4aa98f 100644 --- a/modules/text/src/ocr_holistic.cpp +++ b/modules/text/src/ocr_holistic.cpp @@ -1,3 +1,7 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + #include "precomp.hpp" #include "opencv2/imgproc.hpp" #include "opencv2/core.hpp" diff --git a/modules/text/src/text_detectorCNN.cpp b/modules/text/src/text_detectorCNN.cpp new file mode 100644 index 000000000..84f769b42 --- /dev/null +++ b/modules/text/src/text_detectorCNN.cpp @@ -0,0 +1,94 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#include "precomp.hpp" +#include "opencv2/imgproc.hpp" +#include "opencv2/core.hpp" +#include "opencv2/dnn.hpp" + +#include +#include + +using namespace cv::dnn; + +namespace cv +{ +namespace text +{ + +class TextDetectorCNNImpl : public TextDetectorCNN +{ +protected: + Net net_; + std::vector sizes_; + int inputChannelCount_; + + void getOutputs(const float* buffer,int nbrTextBoxes,int nCol, + std::vector& Bbox, std::vector& confidence, Size inputShape) + { + for(int k = 0; k < nbrTextBoxes; k++) + { + float x_min = buffer[k*nCol + 3]*inputShape.width; + float y_min = buffer[k*nCol + 4]*inputShape.height; + + float x_max = buffer[k*nCol + 5]*inputShape.width; + float y_max = buffer[k*nCol + 6]*inputShape.height; + + CV_Assert(x_min < x_max, y_min < y_max); + + x_min = std::max(0.f, x_min); + y_min = std::max(0.f, y_min); + + x_max = std::min(inputShape.width - 1.f, x_max); + y_max = std::min(inputShape.height - 1.f, y_max); + + int wd = cvRound(x_max - x_min); + int ht = cvRound(y_max - y_min); + + Bbox.push_back(Rect(cvRound(x_min), cvRound(y_min), wd, ht)); + confidence.push_back(buffer[k*nCol + 2]); + } + } + +public: + TextDetectorCNNImpl(const String& modelArchFilename, const String& modelWeightsFilename, std::vector detectionSizes) : + sizes_(detectionSizes) + { + net_ = readNetFromCaffe(modelArchFilename, modelWeightsFilename); + CV_Assert(!net_.empty()); + inputChannelCount_ = 3; + } + + void detect(InputArray inputImage_, std::vector& Bbox, std::vector& confidence) + { + CV_Assert(inputImage_.channels() == inputChannelCount_); + Mat inputImage = inputImage_.getMat(); + Bbox.resize(0); + confidence.resize(0); + + for(size_t i = 0; i < sizes_.size(); i++) + { + Size inputGeometry = sizes_[i]; + net_.setInput(blobFromImage(inputImage, 1, inputGeometry, Scalar(123, 117, 104), false, false), "data"); + Mat outputNet = net_.forward(); + int nbrTextBoxes = outputNet.size[2]; + int nCol = outputNet.size[3]; + int outputChannelCount = outputNet.size[1]; + 
CV_Assert(outputChannelCount == 1); + getOutputs((float*)(outputNet.data), nbrTextBoxes, nCol, Bbox, confidence, inputImage.size()); + } + } +}; + +Ptr TextDetectorCNN::create(const String &modelArchFilename, const String &modelWeightsFilename, std::vector detectionSizes) +{ + return makePtr(modelArchFilename, modelWeightsFilename, detectionSizes); +} + +Ptr TextDetectorCNN::create(const String &modelArchFilename, const String &modelWeightsFilename) +{ + return create(modelArchFilename, modelWeightsFilename, std::vector(1, Size(300, 300))); +} +} //namespace text +} //namespace cv