Merge pull request #1399 from sovrasov:text_detector_dnn

8 years ago · 6651fb0b45
parent e85a802a90 fd2e37da56
commit 6651fb0b45
13 changed files with 2063 additions and 10 deletions
--- a/modules/text/README.md
+++ b/modules/text/README.md
@ -47,3 +47,12 @@ Notes
 2. Tesseract configure script may fail to detect leptonica, so you may have to edit the configure script - comment off some if's around this message and retain only "then" branch.

 3. You are encouraged to search the Net for some better pre-trained classifiers, as well as classifiers for other languages.
+
+
+Text Detection CNN
+=================
+
+Intro
+-----
+
+The text module now has text detection and recognition based on deep CNNs. The text detector is a deep CNN that takes an image which may contain multiple words and outputs a list of Rects with bounding boxes, each with the probability that it contains text. The text recognizer provides a probability over a given vocabulary for each of these rects.
--- a/modules/text/cmake/FindTesseract.cmake
+++ b/modules/text/cmake/FindTesseract.cmake
@ -5,14 +5,17 @@ endif()
 if(NOT Tesseract_FOUND)
  find_path(Tesseract_INCLUDE_DIR tesseract/baseapi.h
    HINTS
+    /usr/include
    /usr/local/include)

  find_library(Tesseract_LIBRARY NAMES tesseract
    HINTS
+    /usr/lib
    /usr/local/lib)

  find_library(Lept_LIBRARY NAMES lept
    HINTS
+    /usr/lib
    /usr/local/lib)

  if(Tesseract_INCLUDE_DIR AND Tesseract_LIBRARY AND Lept_LIBRARY)
--- a/modules/text/doc/text.bib
+++ b/modules/text/doc/text.bib
@ -31,4 +31,14 @@
  journal   = {CoRR},
  volume    = {abs/1407.7504},
  year      = {2014},
-}
+}
+@inproceedings{LiaoSBWL17,
+  author    = {Minghui Liao and
+               Baoguang Shi and
+               Xiang Bai and
+               Xinggang Wang and
+               Wenyu Liu},
+  title     = {TextBoxes: {A} Fast Text Detector with a Single Deep Neural Network},
+  booktitle = {AAAI},
+  year      = {2017}
+}
--- a/modules/text/include/opencv2/text.hpp
+++ b/modules/text/include/opencv2/text.hpp
@ -41,6 +41,7 @@ the use of this software, even if advised of the possibility of such damage.

 #include "opencv2/text/erfilter.hpp"
 #include "opencv2/text/ocr.hpp"
+#include "opencv2/text/textDetector.hpp"

 /** @defgroup text Scene Text Detection and Recognition

--- a/modules/text/include/opencv2/text/ocr.hpp
+++ b/modules/text/include/opencv2/text/ocr.hpp
@ -44,6 +44,8 @@
 #ifndef __OPENCV_TEXT_OCR_HPP__
 #define __OPENCV_TEXT_OCR_HPP__

+#include <opencv2/core.hpp>
+
 #include <vector>
 #include <string>

--- a/modules/text/include/opencv2/text/textDetector.hpp
+++ b/modules/text/include/opencv2/text/textDetector.hpp
@ -0,0 +1,73 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef __OPENCV_TEXT_TEXTDETECTOR_HPP__
+#define __OPENCV_TEXT_TEXTDETECTOR_HPP__
+
+#include"ocr.hpp"
+
+namespace cv
+{
+namespace text
+{
+
+//! @addtogroup text_detect
+//! @{
+
+/** @brief An abstract class providing an interface for text detection algorithms.
+
+ Implementations report the detected text regions as rectangles together with one
+ confidence value per rectangle.
+ */
+class CV_EXPORTS_W TextDetector
+{
+public:
+    /**
+    @brief Method that provides a quick and simple interface to detect text inside an image
+
+    @param inputImage an image to process
+    @param Bbox a vector of Rect that will store the bounding boxes of the detected words
+    @param confidence a vector of float that will be updated with the confidence the classifier has
+    for each bounding box stored in Bbox
+    */
+    CV_WRAP virtual void detect(InputArray inputImage, CV_OUT std::vector<Rect>& Bbox, CV_OUT std::vector<float>& confidence) = 0;
+    // Virtual so that implementations can be destroyed through a TextDetector pointer.
+    virtual ~TextDetector() {}
+};
+
+/** @brief TextDetectorCNN class provides the functionality of text bounding box detection.
+ Given an input image, this class finds the bounding boxes of the text words it contains.
+ This class uses OpenCV dnn module to load pre-trained model described in @cite LiaoSBWL17.
+ The original repository with the modified SSD Caffe version: https://github.com/MhLiao/TextBoxes.
+ Model can be downloaded from [DropBox](https://www.dropbox.com/s/g8pjzv2de9gty8g/TextBoxes_icdar13.caffemodel?dl=0).
+ Modified .prototxt file with the model description can be found in `opencv_contrib/modules/text/samples/textbox.prototxt`.
+ */
+class CV_EXPORTS_W TextDetectorCNN : public TextDetector
+{
+public:
+    /**
+    @overload
+
+    @param inputImage an image expected to be a CV_8UC3 of any size
+    @param Bbox a vector of Rect that will store the bounding boxes of the detected words
+    @param confidence a vector of float that will be updated with the confidence the classifier has
+    for each bounding box stored in Bbox
+    */
+    CV_WRAP virtual void detect(InputArray inputImage, CV_OUT std::vector<Rect>& Bbox, CV_OUT std::vector<float>& confidence) = 0;
+
+    /** @brief Creates an instance of the TextDetectorCNN class using the provided parameters.
+
+    @param modelArchFilename the relative or absolute path to the prototxt file describing the classifiers architecture.
+    @param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form.
+    @param detectionSizes a list of sizes for multiscale detection. The values `[(300,300),(700,500),(700,300),(700,700),(1600,1600)]` are
+    recommended in @cite LiaoSBWL17 to achieve the best quality.
+
+    @note This overload is declared without CV_WRAP, unlike the two-argument overload below.
+    */
+    static Ptr<TextDetectorCNN> create(const String& modelArchFilename, const String& modelWeightsFilename,
+                                               std::vector<Size> detectionSizes);
+    /**
+      @overload
+    */
+    CV_WRAP static Ptr<TextDetectorCNN> create(const String& modelArchFilename, const String& modelWeightsFilename);
+};
+
+//! @}
+}//namespace text
+}//namespace cv
+
+
+#endif // __OPENCV_TEXT_TEXTDETECTOR_HPP__
--- a/modules/text/samples/deeptextdetection.py
+++ b/modules/text/samples/deeptextdetection.py
@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+#!/usr/bin/python
+import sys
+import os
+import cv2
+import numpy as np
+
+def main():
+    print('\nDeeptextdetection.py')
+    print('       A demo script of text box alogorithm of the paper:')
+    print('       * Minghui Liao et al.: TextBoxes: A Fast Text Detector with a Single Deep Neural Network https://arxiv.org/abs/1611.06779\n')
+
+    if (len(sys.argv) < 2):
+        print(' (ERROR) You must call this script with an argument (path_to_image_to_be_processed)\n')
+        quit()
+
+    if not os.path.isfile('TextBoxes_icdar13.caffemodel') or not os.path.isfile('textbox.prototxt'):
+        print " Model files not found in current directory. Aborting"
+        print " See the documentation of text::TextDetectorCNN class to get download links."
+        quit()
+
+    img = cv2.imread(str(sys.argv[1]))
+    textSpotter = cv2.text.TextDetectorCNN_create("textbox.prototxt", "TextBoxes_icdar13.caffemodel")
+    rects, outProbs = textSpotter.detect(img);
+    vis = img.copy()
+    thres = 0.6
+
+    for r in range(np.shape(rects)[0]):
+        if outProbs[r] > thres:
+            rect = rects[r]
+            cv2.rectangle(vis, (rect[0],rect[1]), (rect[0] + rect[2], rect[1] + rect[3]), (255, 0, 0), 2)
+
+    cv2.imshow("Text detection result", vis)
+    cv2.waitKey()
+
+if __name__ == "__main__":
+    main()
--- a/modules/text/samples/dictnet_demo.cpp
+++ b/modules/text/samples/dictnet_demo.cpp
@ -1,12 +1,3 @@
-/*
- * dictnet_demo.cpp
- *
- * Demonstrates simple use of the holistic word classifier in C++
- *
- * Created on: June 26, 2016
- *     Author: Anguelos Nicolaou <anguelos.nicolaou AT gmail.com>
- */
-
 #include  "opencv2/text.hpp"
 #include  "opencv2/highgui.hpp"
 #include  "opencv2/imgproc.hpp"
--- a/modules/text/samples/text_recognition_cnn.cpp
+++ b/modules/text/samples/text_recognition_cnn.cpp
@ -0,0 +1,122 @@
+#include <opencv2/text.hpp>
+#include <opencv2/highgui.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/dnn.hpp>
+
+#include  <iostream>
+#include  <fstream>
+
+using namespace cv;
+using namespace std;
+
+namespace
+{
+/** Prints the demo description, the command-line usage and the model download hints to stdout.
+ *  @param progFname program name shown in the usage line (callers pass argv[0])
+ */
+void printHelpStr(const string& progFname)
+{
+    // main() reads a single argument (argv[1], the input image), so the usage line
+    // advertises exactly that and no output file.
+    cout << "   Demo of text recognition CNN for text detection." << endl
+         << "   Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015"<<endl<<endl
+         << "   Usage: " << progFname << " <input_image>" << endl
+         << "   Caffe Model files (textbox.prototxt, TextBoxes_icdar13.caffemodel)"<<endl
+         << "     must be in the current directory. See the documentation of text::TextDetectorCNN class to get download links." << endl
+         << "   Obtaining text recognition Caffe Model files in linux shell:" << endl
+         << "   wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg.caffemodel" << endl
+         << "   wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_deploy.prototxt" << endl
+         << "   wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_labels.txt" <<endl << endl;
+}
+
+/** Reports whether the stream opened on @p filename for reading is in a good state. */
+bool fileExists (const string& filename)
+{
+    ifstream probe(filename.c_str());
+    const bool readable = probe.good();
+    return readable;
+}
+
+/** Draws the detections selected by @p indexes onto @p src.
+ *
+ *  For a CV_8UC3 image each box is outlined, logged to stdout and labelled with its
+ *  confidence; for any other image type only a plain outline is drawn.
+ *
+ *  @param src     image to draw on (a Mat header passed by value still refers to the caller's pixels)
+ *  @param groups  all candidate boxes
+ *  @param probs   confidence of each candidate box, index-aligned with @p groups
+ *  @param indexes positions in @p groups / @p probs of the boxes to draw
+ */
+void textbox_draw(Mat src, std::vector<Rect>& groups, std::vector<float>& probs, std::vector<int>& indexes)
+{
+    // The image type cannot change while drawing, so pick the rendering mode once.
+    const bool isColor = (src.type() == CV_8UC3);
+
+    for (size_t i = 0; i < indexes.size(); i++)
+    {
+        // indexes holds positions into groups/probs; both rendering modes must go through it.
+        const int boxIdx = indexes[i];
+        const Rect currentBox = groups[boxIdx];
+
+        if (!isColor)
+        {
+            rectangle(src, currentBox, Scalar( 255 ), 3, 8 );
+            continue;
+        }
+
+        rectangle(src, currentBox, Scalar( 0, 255, 255 ), 2, LINE_AA);
+        String label = format("%.2f", probs[boxIdx]);
+        std::cout << "text box: " << currentBox << " confidence: " << probs[boxIdx] << "\n";
+
+        // Keep the label inside the image when the box touches the top border.
+        int baseLine = 0;
+        Size labelSize = getTextSize(label, FONT_HERSHEY_PLAIN, 1, 1, &baseLine);
+        int yLeftBottom = std::max(currentBox.y, labelSize.height);
+        rectangle(src, Point(currentBox.x, yLeftBottom - labelSize.height),
+                  Point(currentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED);
+
+        putText(src, label, Point(currentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA);
+    }
+}
+
+}
+
+/** Detects words in the image given as argv[1] with TextDetectorCNN, keeps the boxes that
+ *  survive non-maximum suppression and labels each of them with the word returned by
+ *  OCRHolisticWordRecognizer.
+ *
+ *  @return 0 on success, 1 when the argument, a model file or the image is missing
+ */
+int main(int argc, const char * argv[])
+{
+    if (argc < 2)
+    {
+        printHelpStr(argv[0]);
+        cout << "Insufiecient parameters. Aborting!" << endl;
+        return 1;
+    }
+
+    const string modelArch = "textbox.prototxt";
+    const string moddelWeights = "TextBoxes_icdar13.caffemodel";
+    const string recognizerArch = "dictnet_vgg_deploy.prototxt";
+    const string recognizerWeights = "dictnet_vgg.caffemodel";
+    const string recognizerLabels = "dictnet_vgg_labels.txt";
+
+    // The recognizer files are checked up front as well, so a missing download is reported
+    // together with the help text before any detection work is done.
+    if (!fileExists(modelArch) || !fileExists(moddelWeights) ||
+        !fileExists(recognizerArch) || !fileExists(recognizerWeights) || !fileExists(recognizerLabels))
+    {
+        printHelpStr(argv[0]);
+        cout << "Model files not found in the current directory. Aborting!" << endl;
+        return 1;
+    }
+
+    Mat image = imread(String(argv[1]), IMREAD_COLOR);
+    if (image.empty())
+    {
+        // imread reports an unreadable file by returning an empty Mat.
+        cout << "Could not read image " << argv[1] << ". Aborting!" << endl;
+        return 1;
+    }
+
+    cout << "Starting Text Box Demo" << endl;
+    Ptr<text::TextDetectorCNN> textSpotter =
+            text::TextDetectorCNN::create(modelArch, moddelWeights);
+
+    vector<Rect> bbox;
+    vector<float> outProbabillities;
+    textSpotter->detect(image, bbox, outProbabillities);
+    std::vector<int> indexes;
+    cv::dnn::NMSBoxes(bbox, outProbabillities, 0.4f, 0.5f, indexes);
+
+    Mat image_copy = image.clone();
+    textbox_draw(image_copy, bbox, outProbabillities, indexes);
+    imshow("Text detection", image_copy);
+    image_copy = image.clone();
+
+    Ptr<text::OCRHolisticWordRecognizer> wordSpotter =
+            text::OCRHolisticWordRecognizer::create(recognizerArch, recognizerWeights, recognizerLabels);
+
+    const Rect imageArea(0, 0, image.cols, image.rows);
+    for(size_t i = 0; i < indexes.size(); i++)
+    {
+        // Clip to the image and skip degenerate boxes: an empty ROI cannot be colour-converted.
+        const Rect currentBox = bbox[indexes[i]] & imageArea;
+        if (currentBox.width <= 0 || currentBox.height <= 0)
+            continue;
+
+        Mat wordImg;
+        cvtColor(image(currentBox), wordImg, COLOR_BGR2GRAY);
+        string word;
+        vector<float> confs;
+        wordSpotter->run(wordImg, word, NULL, NULL, &confs);
+
+        rectangle(image_copy, currentBox, Scalar( 0, 255, 255 ), 2, LINE_AA);
+
+        // Keep the label inside the image when the box touches the top border.
+        int baseLine = 0;
+        Size labelSize = getTextSize(word, FONT_HERSHEY_PLAIN, 1, 1, &baseLine);
+        int yLeftBottom = std::max(currentBox.y, labelSize.height);
+        rectangle(image_copy, Point(currentBox.x, yLeftBottom - labelSize.height),
+                  Point(currentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED);
+
+        putText(image_copy, word, Point(currentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA);
+    }
+    imshow("Text recognition", image_copy);
+    cout << "Recognition finished. Press any key to exit.\n";
+    waitKey();
+    return 0;
+}
--- a/modules/text/samples/textbox.prototxt
+++ b/modules/text/samples/textbox.prototxt
--- a/modules/text/samples/textbox_demo.cpp
+++ b/modules/text/samples/textbox_demo.cpp
@ -0,0 +1,96 @@
+#include <opencv2/text.hpp>
+#include <opencv2/highgui.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/dnn.hpp>
+
+#include  <sstream>
+#include  <iostream>
+#include  <fstream>
+
+using namespace cv;
+
+namespace
+{
+/** Builds the help text: demo description, command-line usage and where the model files must be.
+ *  @param progFname program name inserted into the usage line (callers pass argv[0])
+ *  @return the formatted, multi-line help message
+ */
+std::string getHelpStr(const std::string& progFname)
+{
+    std::stringstream out;
+    // main() reads a single argument (argv[1], the input image), so the usage line
+    // advertises exactly that and no output file.
+    out << "    Demo of text detection CNN for text detection." << std::endl
+        << "    Minghui Liao, Baoguang Shi, Xiang Bai, Xinggang Wang, Wenyu Liu: TextBoxes: A Fast Text Detector with a Single Deep Neural Network, AAAI2017\n\n"
+        << "    Usage: " << progFname << " <input_image>" << std::endl
+        << "    Caffe Model files  (textbox.prototxt, TextBoxes_icdar13.caffemodel)"<<std::endl
+        << "      must be in the current directory. See the documentation of text::TextDetectorCNN class to get download links." << std::endl;
+    return out.str();
+}
+
+/** Reports whether the stream opened on @p filename for reading is in a good state. */
+bool fileExists (const std::string& filename)
+{
+    std::ifstream probe(filename.c_str());
+    const bool readable = probe.good();
+    return readable;
+}
+
+/** Draws the detections selected by @p indexes onto @p src.
+ *
+ *  For a CV_8UC3 image each box is outlined, logged to stdout and labelled with its
+ *  confidence; for any other image type only a plain outline is drawn.
+ *
+ *  @param src     image to draw on (a Mat header passed by value still refers to the caller's pixels)
+ *  @param groups  all candidate boxes
+ *  @param probs   confidence of each candidate box, index-aligned with @p groups
+ *  @param indexes positions in @p groups / @p probs of the boxes to draw
+ */
+void textbox_draw(Mat src, std::vector<Rect>& groups, std::vector<float>& probs, std::vector<int>& indexes)
+{
+    // The image type cannot change while drawing, so pick the rendering mode once.
+    const bool isColor = (src.type() == CV_8UC3);
+
+    for (size_t i = 0; i < indexes.size(); i++)
+    {
+        // indexes holds positions into groups/probs; both rendering modes must go through it.
+        const int boxIdx = indexes[i];
+        const Rect currentBox = groups[boxIdx];
+
+        if (!isColor)
+        {
+            rectangle(src, currentBox, Scalar( 255 ), 3, 8 );
+            continue;
+        }
+
+        rectangle(src, currentBox, Scalar( 0, 255, 255 ), 2, LINE_AA);
+        String label = format("%.2f", probs[boxIdx]);
+        std::cout << "text box: " << currentBox << " confidence: " << probs[boxIdx] << "\n";
+
+        // Keep the label inside the image when the box touches the top border.
+        int baseLine = 0;
+        Size labelSize = getTextSize(label, FONT_HERSHEY_PLAIN, 1, 1, &baseLine);
+        int yLeftBottom = std::max(currentBox.y, labelSize.height);
+        rectangle(src, Point(currentBox.x, yLeftBottom - labelSize.height),
+                  Point(currentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED);
+
+        putText(src, label, Point(currentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA);
+    }
+}
+
+}
+
+/** Runs TextDetectorCNN on the image given as argv[1], applies non-maximum suppression
+ *  to the detections and displays the surviving boxes.
+ *
+ *  @return 0 on success, 1 when the argument, a model file or the image is missing
+ */
+int main(int argc, const char * argv[])
+{
+    if (argc < 2)
+    {
+        std::cout << getHelpStr(argv[0]);
+        std::cout << "Insufiecient parameters. Aborting!" << std::endl;
+        return 1;
+    }
+
+    const std::string modelArch = "textbox.prototxt";
+    const std::string moddelWeights = "TextBoxes_icdar13.caffemodel";
+
+    if (!fileExists(modelArch) || !fileExists(moddelWeights))
+    {
+        std::cout << getHelpStr(argv[0]);
+        std::cout << "Model files not found in the current directory. Aborting!" << std::endl;
+        return 1;
+    }
+
+    Mat image = imread(String(argv[1]), IMREAD_COLOR);
+    if (image.empty())
+    {
+        // imread reports an unreadable file by returning an empty Mat.
+        std::cout << "Could not read image " << argv[1] << ". Aborting!" << std::endl;
+        return 1;
+    }
+
+    std::cout << "Starting Text Box Demo" << std::endl;
+    Ptr<text::TextDetectorCNN> textSpotter =
+            text::TextDetectorCNN::create(modelArch, moddelWeights);
+
+    std::vector<Rect> bbox;
+    std::vector<float> outProbabillities;
+    textSpotter->detect(image, bbox, outProbabillities);
+
+    // Keep only the locally best boxes; indexes receives their positions in bbox.
+    std::vector<int> indexes;
+    cv::dnn::NMSBoxes(bbox, outProbabillities, 0.3f, 0.4f, indexes);
+
+    textbox_draw(image, bbox, outProbabillities, indexes);
+
+    imshow("TextBox Demo",image);
+    std::cout << "Done!" << std::endl << std::endl;
+    std::cout << "Press any key to exit." << std::endl << std::endl;
+    waitKey();
+    return 0;
+}
--- a/modules/text/src/ocr_holistic.cpp
+++ b/modules/text/src/ocr_holistic.cpp
@ -1,3 +1,7 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
 #include "precomp.hpp"
 #include "opencv2/imgproc.hpp"
 #include "opencv2/core.hpp"
--- a/modules/text/src/text_detectorCNN.cpp
+++ b/modules/text/src/text_detectorCNN.cpp
@ -0,0 +1,94 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "precomp.hpp"
+#include "opencv2/imgproc.hpp"
+#include "opencv2/core.hpp"
+#include "opencv2/dnn.hpp"
+
+#include <fstream>
+#include <algorithm>
+
+using namespace cv::dnn;
+
+namespace cv
+{
+namespace text
+{
+
+/** @brief TextDetectorCNN implementation that runs a Caffe model through the OpenCV dnn module.
+
+ detect() performs one forward pass per configured input size and appends the boxes
+ of every pass to the same output vectors.
+ */
+class TextDetectorCNNImpl : public TextDetectorCNN
+{
+protected:
+    Net net_;                   //!< network loaded from the prototxt/caffemodel pair
+    std::vector<Size> sizes_;   //!< network input sizes, one forward pass each
+    int inputChannelCount_;     //!< channel count required of the input image
+
+    /** @brief Converts raw detection rows into pixel rectangles and appends them to the outputs.
+
+    Each row consists of nCol floats. Column 2 is read as the confidence and columns 3..6 as
+    x_min, y_min, x_max, y_max, which are multiplied by the image size to obtain pixels.
+
+    @param buffer first element of the detection rows
+    @param nbrTextBoxes number of rows in buffer
+    @param nCol number of floats per row
+    @param Bbox receives one rectangle per row
+    @param confidence receives one confidence per row
+    @param inputShape size of the image the rectangles refer to
+    */
+    void getOutputs(const float* buffer,int nbrTextBoxes,int nCol,
+                               std::vector<Rect>& Bbox, std::vector<float>& confidence, Size inputShape)
+    {
+        const float lastCol = inputShape.width - 1.f;
+        const float lastRow = inputShape.height - 1.f;
+
+        for(int k = 0; k < nbrTextBoxes; k++)
+        {
+            const float* row = buffer + k*nCol;
+
+            float x_min = row[3]*inputShape.width;
+            float y_min = row[4]*inputShape.height;
+
+            float x_max = row[5]*inputShape.width;
+            float y_max = row[6]*inputShape.height;
+
+            // One condition per CV_Assert: does not depend on the macro accepting several arguments.
+            CV_Assert(x_min < x_max);
+            CV_Assert(y_min < y_max);
+
+            // Clamp every corner to the image on both sides. Clamping only the lower bound of
+            // the min corner and the upper bound of the max corner yields a negative width or
+            // height for a box that lies completely outside the image.
+            x_min = std::min(std::max(0.f, x_min), lastCol);
+            y_min = std::min(std::max(0.f, y_min), lastRow);
+
+            x_max = std::min(std::max(0.f, x_max), lastCol);
+            y_max = std::min(std::max(0.f, y_max), lastRow);
+
+            int wd = cvRound(x_max - x_min);
+            int ht = cvRound(y_max - y_min);
+
+            Bbox.push_back(Rect(cvRound(x_min), cvRound(y_min), wd, ht));
+            confidence.push_back(row[2]);
+        }
+    }
+
+public:
+    /** @brief Loads the network and stores the detection sizes.
+
+    @param modelArchFilename path to the prototxt file
+    @param modelWeightsFilename path to the caffemodel file
+    @param detectionSizes network input sizes used by detect()
+    */
+    TextDetectorCNNImpl(const String& modelArchFilename, const String& modelWeightsFilename, std::vector<Size> detectionSizes) :
+        sizes_(detectionSizes), inputChannelCount_(3)
+    {
+        net_ = readNetFromCaffe(modelArchFilename, modelWeightsFilename);
+        CV_Assert(!net_.empty());
+    }
+
+    /** @brief Runs the network once per configured size and collects all detections.
+
+    @param inputImage_ image with exactly inputChannelCount_ channels
+    @param Bbox cleared, then filled with the detected rectangles in image coordinates
+    @param confidence cleared, then filled with one confidence per rectangle
+    */
+    void detect(InputArray inputImage_, std::vector<Rect>& Bbox, std::vector<float>& confidence)
+    {
+        CV_Assert(inputImage_.channels() == inputChannelCount_);
+        Mat inputImage = inputImage_.getMat();
+        Bbox.clear();
+        confidence.clear();
+
+        for(size_t i = 0; i < sizes_.size(); i++)
+        {
+            const Size inputGeometry = sizes_[i];
+            net_.setInput(blobFromImage(inputImage, 1, inputGeometry, Scalar(123, 117, 104), false, false), "data");
+            Mat outputNet = net_.forward();
+
+            // getOutputs() indexes the blob as rows of floats and reads columns 2..6,
+            // so verify the layout before touching the data.
+            CV_Assert(outputNet.dims == 4);
+            CV_Assert(outputNet.depth() == CV_32F);
+
+            const int outputChannelCount = outputNet.size[1];
+            const int nbrTextBoxes = outputNet.size[2];
+            const int nCol = outputNet.size[3];
+            CV_Assert(outputChannelCount == 1);
+            CV_Assert(nCol >= 7);
+
+            // Boxes are scaled to the original image, not to the network input size.
+            getOutputs(outputNet.ptr<float>(), nbrTextBoxes, nCol, Bbox, confidence, inputImage.size());
+        }
+     }
+};
+
+/** Factory: builds a TextDetectorCNNImpl for the given model files and detection sizes. */
+Ptr<TextDetectorCNN> TextDetectorCNN::create(const String &modelArchFilename, const String &modelWeightsFilename, std::vector<Size> detectionSizes)
+{
+    Ptr<TextDetectorCNNImpl> detector =
+            makePtr<TextDetectorCNNImpl>(modelArchFilename, modelWeightsFilename, detectionSizes);
+    return detector;
+}
+
+/** Factory overload: forwards to the three-argument create() with a single 300x300 detection size. */
+Ptr<TextDetectorCNN> TextDetectorCNN::create(const String &modelArchFilename, const String &modelWeightsFilename)
+{
+    const std::vector<Size> defaultSizes(1, Size(300, 300));
+    return create(modelArchFilename, modelWeightsFilename, defaultSizes);
+}
+} //namespace text
+} //namespace cv