Merge pull request #1399 from sovrasov:text_detector_dnn

8 years ago · 6651fb0b45
parent e85a802a90 fd2e37da56
commit 6651fb0b45
13 changed files with 2063 additions and 10 deletions
--- a/modules/text/README.md
+++ b/modules/text/README.md
@ -47,3 +47,12 @@ Notes
 2. Tesseract configure script may fail to detect leptonica, so you may have to edit the configure script - comment off some if's around this message and retain only "then" branch.
 3. You are encouraged to search the Net for some better pre-trained classifiers, as well as classifiers for other languages.
 Text Detection CNN
 =================
 Intro
 -----
 The text module now provides text detection and recognition using deep CNNs. The text detector is a deep CNN that takes an image which may contain multiple words and outputs a list of Rects with bounding boxes together with the probability that each box contains text. The text recognizer provides a probability over a given vocabulary for each of these rects.
--- a/modules/text/cmake/FindTesseract.cmake
+++ b/modules/text/cmake/FindTesseract.cmake
@ -5,14 +5,17 @@ endif()
 if(NOT Tesseract_FOUND)
  find_path(Tesseract_INCLUDE_DIR tesseract/baseapi.h
    HINTS
    /usr/include
    /usr/local/include)
  find_library(Tesseract_LIBRARY NAMES tesseract
    HINTS
    /usr/lib
    /usr/local/lib)
  find_library(Lept_LIBRARY NAMES lept
    HINTS
    /usr/lib
    /usr/local/lib)
  if(Tesseract_INCLUDE_DIR AND Tesseract_LIBRARY AND Lept_LIBRARY)
--- a/modules/text/doc/text.bib
+++ b/modules/text/doc/text.bib
@ -31,4 +31,14 @@
  journal   = {CoRR},
  volume    = {abs/1407.7504},
  year      = {2014},
-}
+}
@inproceedings{LiaoSBWL17,
  author    = {Minghui Liao and
               Baoguang Shi and
               Xiang Bai and
               Xinggang Wang and
               Wenyu Liu},
  title     = {TextBoxes: {A} Fast Text Detector with a Single Deep Neural Network},
  booktitle = {AAAI},
  year      = {2017}
 }
--- a/modules/text/include/opencv2/text.hpp
+++ b/modules/text/include/opencv2/text.hpp
@ -41,6 +41,7 @@ the use of this software, even if advised of the possibility of such damage.
 #include "opencv2/text/erfilter.hpp"
 #include "opencv2/text/ocr.hpp"
 #include "opencv2/text/textDetector.hpp"
 /** @defgroup text Scene Text Detection and Recognition
--- a/modules/text/include/opencv2/text/ocr.hpp
+++ b/modules/text/include/opencv2/text/ocr.hpp
@ -44,6 +44,8 @@
 #ifndef __OPENCV_TEXT_OCR_HPP__
 #define __OPENCV_TEXT_OCR_HPP__
 #include <opencv2/core.hpp>
 #include <vector>
 #include <string>
--- a/modules/text/include/opencv2/text/textDetector.hpp
+++ b/modules/text/include/opencv2/text/textDetector.hpp
@ -0,0 +1,73 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 #ifndef __OPENCV_TEXT_TEXTDETECTOR_HPP__
 #define __OPENCV_TEXT_TEXTDETECTOR_HPP__
 #include"ocr.hpp"
 namespace cv
 {
 namespace text
 {
 //! @addtogroup text_detect
 //! @{
 /** @brief An abstract class providing interface for text detection algorithms

 The class declares a single pure virtual method, detect(), that concrete detectors implement.
 The destructor is virtual so that a detector can be destroyed through a pointer to this base class.
 */
 class CV_EXPORTS_W TextDetector
 {
 public:
    /**
    @brief Method that provides a quick and simple interface to detect text inside an image

    @param inputImage an image to process
    @param Bbox a vector of Rect that will store the detected word bounding boxes
    @param confidence a vector of float that will be updated with the confidence the classifier has for
    each bounding box stored in Bbox
    */
    CV_WRAP virtual void detect(InputArray inputImage, CV_OUT std::vector<Rect>& Bbox, CV_OUT std::vector<float>& confidence) = 0;
    /** @brief Virtual destructor; the base class itself owns no resources. */
    virtual ~TextDetector() {}
 };
 /** @brief TextDetectorCNN class provides the functionality of text bounding box detection.
 This class finds bounding boxes of text words in an input image.
 This class uses OpenCV dnn module to load pre-trained model described in @cite LiaoSBWL17.
 The original repository with the modified SSD Caffe version: https://github.com/MhLiao/TextBoxes.
 Model can be downloaded from [DropBox](https://www.dropbox.com/s/g8pjzv2de9gty8g/TextBoxes_icdar13.caffemodel?dl=0).
 Modified .prototxt file with the model description can be found in `opencv_contrib/modules/text/samples/textbox.prototxt`.
 */
 class CV_EXPORTS_W TextDetectorCNN : public TextDetector
 {
 public:
    /**
    @overload

    The boxes found at every configured detection size are appended to the same output vectors; no
    non-maximum suppression is applied, so callers may want to filter the result (the samples use
    cv::dnn::NMSBoxes for that).

    @param inputImage an image expected to be a CV_8UC3 of any size
    @param Bbox a vector of Rect that will store the detected word bounding boxes, in pixels of inputImage
    @param confidence a vector of float that will be updated with the confidence the classifier has for
    each bounding box stored in Bbox
    */
    CV_WRAP virtual void detect(InputArray inputImage, CV_OUT std::vector<Rect>& Bbox, CV_OUT std::vector<float>& confidence) = 0;
    /** @brief Creates an instance of the TextDetectorCNN class using the provided parameters.

    This overload is not marked CV_WRAP.

    @param modelArchFilename the relative or absolute path to the prototxt file describing the classifiers architecture.
    @param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form.
    @param detectionSizes a list of sizes for multiscale detection. The values `[(300,300),(700,500),(700,300),(700,700),(1600,1600)]` are
    recommended in @cite LiaoSBWL17 to achieve the best quality.
    */
    static Ptr<TextDetectorCNN> create(const String& modelArchFilename, const String& modelWeightsFilename,
                                               std::vector<Size> detectionSizes);
    /**
      @overload

      Uses a single detection size of 300x300.
    */
    CV_WRAP static Ptr<TextDetectorCNN> create(const String& modelArchFilename, const String& modelWeightsFilename);
 };
 //! @}
 }//namespace text
 }//namespace cv
 #endif // __OPENCV_TEXT_TEXTDETECTOR_HPP__
--- a/modules/text/samples/deeptextdetection.py
+++ b/modules/text/samples/deeptextdetection.py
@ -0,0 +1,37 @@
 # -*- coding: utf-8 -*-
 #!/usr/bin/python
 import sys
 import os
 import cv2
 import numpy as np
 def main():
     """Run the TextBoxes text detection demo on the image given as first CLI argument.

     The script expects ``textbox.prototxt`` and ``TextBoxes_icdar13.caffemodel``
     in the current directory. Every detection whose confidence is above 0.6 is
     drawn on a copy of the image, which is then shown in a window until a key
     is pressed.

     Exits with status 1 when the argument is missing, the model files are not
     found or the image cannot be read.
     """
     print('\nDeeptextdetection.py')
     print('       A demo script of text box alogorithm of the paper:')
     print('       * Minghui Liao et al.: TextBoxes: A Fast Text Detector with a Single Deep Neural Network https://arxiv.org/abs/1611.06779\n')

     if len(sys.argv) < 2:
         print(' (ERROR) You must call this script with an argument (path_to_image_to_be_processed)\n')
         # sys.exit is always available, unlike quit() which is injected by the site module.
         sys.exit(1)

     if not os.path.isfile('TextBoxes_icdar13.caffemodel') or not os.path.isfile('textbox.prototxt'):
         # These two lines used the Python 2 print statement, which is a SyntaxError
         # under Python 3 while the rest of the script already calls print().
         print(" Model files not found in current directory. Aborting")
         print(" See the documentation of text::TextDetectorCNN class to get download links.")
         sys.exit(1)

     img = cv2.imread(str(sys.argv[1]))
     if img is None:
         # cv2.imread signals an unreadable/missing file by returning None.
         print(' (ERROR) Could not read the image: ' + str(sys.argv[1]) + '\n')
         sys.exit(1)

     textSpotter = cv2.text.TextDetectorCNN_create("textbox.prototxt", "TextBoxes_icdar13.caffemodel")
     rects, outProbs = textSpotter.detect(img)

     vis = img.copy()
     thres = 0.6
     # rects and outProbs are parallel sequences: one confidence per rectangle.
     for rect, prob in zip(rects, outProbs):
         if prob > thres:
             # rect is (x, y, width, height); cv2.rectangle wants two opposite corners.
             cv2.rectangle(vis, (rect[0], rect[1]), (rect[0] + rect[2], rect[1] + rect[3]), (255, 0, 0), 2)

     cv2.imshow("Text detection result", vis)
     cv2.waitKey()
 # Run the demo only when this file is executed as a script, not when it is imported.
 if __name__ == "__main__":
    main()
--- a/modules/text/samples/dictnet_demo.cpp
+++ b/modules/text/samples/dictnet_demo.cpp
@ -1,12 +1,3 @@
 /*
 * dictnet_demo.cpp
 *
 * Demonstrates simple use of the holistic word classifier in C++
 *
 * Created on: June 26, 2016
 *     Author: Anguelos Nicolaou <anguelos.nicolaou AT gmail.com>
 */
 #include  "opencv2/text.hpp"
 #include  "opencv2/highgui.hpp"
 #include  "opencv2/imgproc.hpp"
--- a/modules/text/samples/text_recognition_cnn.cpp
+++ b/modules/text/samples/text_recognition_cnn.cpp
@ -0,0 +1,122 @@
 #include <opencv2/text.hpp>
 #include <opencv2/highgui.hpp>
 #include <opencv2/imgproc.hpp>
 #include <opencv2/dnn.hpp>
 #include  <iostream>
 #include  <fstream>
 using namespace cv;
 using namespace std;
 namespace
 {
 /**
  * Prints the demo description, the usage line and the model download hints to stdout.
  *
  * @param progFname program name shown in the usage line (callers pass argv[0]).
  */
 void printHelpStr(const std::string& progFname)
 {
    // main() reads exactly one argument, the input image (argv[1]); the usage line
    // therefore must not advertise an <output_file> parameter.
    std::cout << "   Demo of text recognition CNN for text detection." << std::endl
              << "   Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015" << std::endl << std::endl
              << "   Usage: " << progFname << " <input_image>" << std::endl
              << "   Caffe Model files (textbox.prototxt, TextBoxes_icdar13.caffemodel)" << std::endl
              << "     must be in the current directory. See the documentation of text::TextDetectorCNN class to get download links." << std::endl
              << "   Obtaining text recognition Caffe Model files in linux shell:" << std::endl
              << "   wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg.caffemodel" << std::endl
              << "   wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_deploy.prototxt" << std::endl
              << "   wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_labels.txt" << std::endl << std::endl;
 }
 // Reports whether `filename` can be opened for reading: the stream is opened
 // and its good() state right after construction is returned.
 bool fileExists (const std::string& filename)
 {
    const bool readable = std::ifstream(filename.c_str()).good();
    return readable;
 }
 /**
  * Draws the detections selected by `indexes` on `src`.
  *
  * For CV_8UC3 images every selected box gets an outline plus a filled label showing its
  * confidence with two decimals, and the box/confidence pair is written to stdout.
  * For any other image type only a plain outline is drawn.
  *
  * @param src     image to draw on. NOTE(review): passed by value; main() shows the image it
  *                passed in afterwards, so the drawing is expected to reach the caller's pixels
  *                (shared cv::Mat data) - confirm.
  * @param groups  all candidate boxes.
  * @param probs   confidence of every candidate box, parallel to `groups`.
  * @param indexes positions inside `groups`/`probs` of the boxes to draw.
  */
 void textbox_draw(Mat src, std::vector<Rect>& groups, std::vector<float>& probs, std::vector<int>& indexes)
 {
    // The image type cannot change while drawing, so evaluate it once.
    const bool isColor = (src.type() == CV_8UC3);
    for (size_t i = 0; i < indexes.size(); i++)
    {
        // indexes[i] is the position of the selected detection inside groups/probs.
        const Rect currentBox = groups[indexes[i]];
        if (!isColor)
        {
            // This branch used to draw groups[i], i.e. it indexed the boxes with the loop
            // counter instead of the selected index and so outlined the wrong detections.
            rectangle(src, currentBox, Scalar( 255 ), 3, 8 );
            continue;
        }
        const float boxConfidence = probs[indexes[i]];
        rectangle(src, currentBox, Scalar( 0, 255, 255 ), 2, LINE_AA);
        String label = format("%.2f", boxConfidence);
        std::cout << "text box: " << currentBox << " confidence: " << boxConfidence << "\n";
        int baseLine = 0;
        Size labelSize = getTextSize(label, FONT_HERSHEY_PLAIN, 1, 1, &baseLine);
        // Keep the label inside the image when the box touches the top border.
        int yLeftBottom = std::max(currentBox.y, labelSize.height);
        rectangle(src, Point(currentBox.x, yLeftBottom - labelSize.height),
                  Point(currentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED);
        putText(src, label, Point(currentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA);
    }
 }
 }
 int main(int argc, const char * argv[])
 {
    if (argc < 2)
    {
        printHelpStr(argv[0]);
        cout << "Insufiecient parameters. Aborting!" << endl;
        exit(1);
    }
    const string modelArch = "textbox.prototxt";
    const string moddelWeights = "TextBoxes_icdar13.caffemodel";
    if (!fileExists(modelArch) || !fileExists(moddelWeights))
    {
        printHelpStr(argv[0]);
        cout << "Model files not found in the current directory. Aborting!" << endl;
        exit(1);
    }
    Mat image = imread(String(argv[1]), IMREAD_COLOR);
    cout << "Starting Text Box Demo" << endl;
    Ptr<text::TextDetectorCNN> textSpotter =
            text::TextDetectorCNN::create(modelArch, moddelWeights);
    vector<Rect> bbox;
    vector<float> outProbabillities;
    textSpotter->detect(image, bbox, outProbabillities);
    std::vector<int> indexes;
    cv::dnn::NMSBoxes(bbox, outProbabillities, 0.4f, 0.5f, indexes);
    Mat image_copy = image.clone();
    textbox_draw(image_copy, bbox, outProbabillities, indexes);
    imshow("Text detection", image_copy);
    image_copy = image.clone();
    Ptr<text::OCRHolisticWordRecognizer> wordSpotter =
            text::OCRHolisticWordRecognizer::create("dictnet_vgg_deploy.prototxt", "dictnet_vgg.caffemodel", "dictnet_vgg_labels.txt");
    for(size_t i = 0; i < indexes.size(); i++)
    {
        Mat wordImg;
        cvtColor(image(bbox[indexes[i]]), wordImg, COLOR_BGR2GRAY);
        string word;
        vector<float> confs;
        wordSpotter->run(wordImg, word, NULL, NULL, &confs);
        Rect currrentBox = bbox[indexes[i]];
        rectangle(image_copy, currrentBox, Scalar( 0, 255, 255 ), 2, LINE_AA);
        int baseLine = 0;
        Size labelSize = getTextSize(word, FONT_HERSHEY_PLAIN, 1, 1, &baseLine);
        int yLeftBottom = std::max(currrentBox.y, labelSize.height);
        rectangle(image_copy, Point(currrentBox.x, yLeftBottom - labelSize.height),
                  Point(currrentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED);
        putText(image_copy, word, Point(currrentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA);
    }
    imshow("Text recognition", image_copy);
    cout << "Recognition finished. Press any key to exit.\n";
    waitKey();
    return 0;
 }
--- a/modules/text/samples/textbox.prototxt
+++ b/modules/text/samples/textbox.prototxt
--- a/modules/text/samples/textbox_demo.cpp
+++ b/modules/text/samples/textbox_demo.cpp
@ -0,0 +1,96 @@
 #include <opencv2/text.hpp>
 #include <opencv2/highgui.hpp>
 #include <opencv2/imgproc.hpp>
 #include <opencv2/dnn.hpp>
 #include  <sstream>
 #include  <iostream>
 #include  <fstream>
 using namespace cv;
 namespace
 {
 /**
  * Builds the help text of the demo: description, usage line and required model files.
  *
  * @param progFname program name shown in the usage line (callers pass argv[0]).
  * @return the complete, newline-terminated help text.
  */
 std::string getHelpStr(const std::string& progFname)
 {
    std::stringstream out;
    // main() reads exactly one argument, the input image (argv[1]); the usage line
    // therefore must not advertise an <output_file> parameter.
    out << "    Demo of text detection CNN for text detection." << std::endl
        << "    Minghui Liao, Baoguang Shi, Xiang Bai, Xinggang Wang, Wenyu Liu: TextBoxes: A Fast Text Detector with a Single Deep Neural Network, AAAI2017\n\n"
        << "    Usage: " << progFname << " <input_image>" << std::endl
        << "    Caffe Model files  (textbox.prototxt, TextBoxes_icdar13.caffemodel)"<<std::endl
        << "      must be in the current directory. See the documentation of text::TextDetectorCNN class to get download links." << std::endl;
    return out.str();
 }
 // Reports whether `filename` can be opened for reading: the stream is opened
 // and its good() state right after construction is returned.
 bool fileExists (const std::string& filename)
 {
    const bool readable = std::ifstream(filename.c_str()).good();
    return readable;
 }
 /**
  * Draws the detections selected by `indexes` on `src`.
  *
  * For CV_8UC3 images every selected box gets an outline plus a filled label showing its
  * confidence with two decimals, and the box/confidence pair is written to stdout.
  * For any other image type only a plain outline is drawn.
  *
  * @param src     image to draw on. NOTE(review): passed by value; main() shows the image it
  *                passed in afterwards, so the drawing is expected to reach the caller's pixels
  *                (shared cv::Mat data) - confirm.
  * @param groups  all candidate boxes.
  * @param probs   confidence of every candidate box, parallel to `groups`.
  * @param indexes positions inside `groups`/`probs` of the boxes to draw.
  */
 void textbox_draw(Mat src, std::vector<Rect>& groups, std::vector<float>& probs, std::vector<int>& indexes)
 {
    // The image type cannot change while drawing, so evaluate it once.
    const bool isColor = (src.type() == CV_8UC3);
    for (size_t i = 0; i < indexes.size(); i++)
    {
        // indexes[i] is the position of the selected detection inside groups/probs.
        const Rect currentBox = groups[indexes[i]];
        if (!isColor)
        {
            // This branch used to draw groups[i], i.e. it indexed the boxes with the loop
            // counter instead of the selected index and so outlined the wrong detections.
            rectangle(src, currentBox, Scalar( 255 ), 3, 8 );
            continue;
        }
        const float boxConfidence = probs[indexes[i]];
        rectangle(src, currentBox, Scalar( 0, 255, 255 ), 2, LINE_AA);
        String label = format("%.2f", boxConfidence);
        std::cout << "text box: " << currentBox << " confidence: " << boxConfidence << "\n";
        int baseLine = 0;
        Size labelSize = getTextSize(label, FONT_HERSHEY_PLAIN, 1, 1, &baseLine);
        // Keep the label inside the image when the box touches the top border.
        int yLeftBottom = std::max(currentBox.y, labelSize.height);
        rectangle(src, Point(currentBox.x, yLeftBottom - labelSize.height),
                  Point(currentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED);
        putText(src, label, Point(currentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA);
    }
 }
 }
 /**
  * Text detection demo.
  *
  * Detects word boxes in the image given as argv[1] with TextDetectorCNN, filters them with
  * non-maximum suppression, draws the kept boxes on the image and shows it until a key is pressed.
  *
  * @return 0 on success, 1 when the argument is missing, the model files are not found in the
  *         current directory or the image cannot be read.
  */
 int main(int argc, const char * argv[])
 {
    if (argc < 2)
    {
        std::cout << getHelpStr(argv[0]);
        std::cout << "Insufiecient parameters. Aborting!" << std::endl;
        return 1;
    }
    const std::string modelArch = "textbox.prototxt";
    const std::string moddelWeights = "TextBoxes_icdar13.caffemodel";
    if (!fileExists(modelArch) || !fileExists(moddelWeights))
    {
        std::cout << getHelpStr(argv[0]);
        std::cout << "Model files not found in the current directory. Aborting!" << std::endl;
        return 1;
    }
    Mat image = imread(String(argv[1]), IMREAD_COLOR);
    if (image.empty())
    {
        // imread returns an empty matrix when the file is missing or cannot be decoded;
        // stop here with a readable message instead of failing inside the detector.
        std::cout << "Could not read the input image: " << argv[1] << std::endl;
        return 1;
    }
    std::cout << "Starting Text Box Demo" << std::endl;
    Ptr<text::TextDetectorCNN> textSpotter =
            text::TextDetectorCNN::create(modelArch, moddelWeights);
    std::vector<Rect> bbox;
    std::vector<float> outProbabillities;
    textSpotter->detect(image, bbox, outProbabillities);
    // Keep only the best of heavily overlapping detections.
    std::vector<int> indexes;
    cv::dnn::NMSBoxes(bbox, outProbabillities, 0.3f, 0.4f, indexes);
    textbox_draw(image, bbox, outProbabillities, indexes);
    imshow("TextBox Demo",image);
    std::cout << "Done!" << std::endl << std::endl;
    std::cout << "Press any key to exit." << std::endl << std::endl;
    waitKey();
    return 0;
 }
--- a/modules/text/src/ocr_holistic.cpp
+++ b/modules/text/src/ocr_holistic.cpp
@ -1,3 +1,7 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 #include "precomp.hpp"
 #include "opencv2/imgproc.hpp"
 #include "opencv2/core.hpp"
--- a/modules/text/src/text_detectorCNN.cpp
+++ b/modules/text/src/text_detectorCNN.cpp
@ -0,0 +1,94 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 #include "precomp.hpp"
 #include "opencv2/imgproc.hpp"
 #include "opencv2/core.hpp"
 #include "opencv2/dnn.hpp"
 #include <fstream>
 #include <algorithm>
 using namespace cv::dnn;
 namespace cv
 {
 namespace text
 {
 class TextDetectorCNNImpl : public TextDetectorCNN
 {
 protected:
    Net net_;
    std::vector<Size> sizes_;
    int inputChannelCount_;
    void getOutputs(const float* buffer,int nbrTextBoxes,int nCol,
                               std::vector<Rect>& Bbox, std::vector<float>& confidence, Size inputShape)
    {
        for(int k = 0; k < nbrTextBoxes; k++)
        {
            float x_min = buffer[k*nCol + 3]*inputShape.width;
            float y_min = buffer[k*nCol + 4]*inputShape.height;
            float x_max = buffer[k*nCol + 5]*inputShape.width;
            float y_max = buffer[k*nCol + 6]*inputShape.height;
            CV_Assert(x_min < x_max, y_min < y_max);
            x_min = std::max(0.f, x_min);
            y_min = std::max(0.f, y_min);
            x_max = std::min(inputShape.width - 1.f,  x_max);
            y_max = std::min(inputShape.height - 1.f,  y_max);
            int wd = cvRound(x_max - x_min);
            int ht = cvRound(y_max - y_min);
            Bbox.push_back(Rect(cvRound(x_min), cvRound(y_min), wd, ht));
            confidence.push_back(buffer[k*nCol + 2]);
        }
    }
 public:
    TextDetectorCNNImpl(const String& modelArchFilename, const String& modelWeightsFilename, std::vector<Size> detectionSizes) :
        sizes_(detectionSizes)
    {
        net_ = readNetFromCaffe(modelArchFilename, modelWeightsFilename);
        CV_Assert(!net_.empty());
        inputChannelCount_ = 3;
    }
    void detect(InputArray inputImage_, std::vector<Rect>& Bbox, std::vector<float>& confidence)
    {
        CV_Assert(inputImage_.channels() == inputChannelCount_);
        Mat inputImage = inputImage_.getMat();
        Bbox.resize(0);
        confidence.resize(0);
        for(size_t i = 0; i < sizes_.size(); i++)
        {
            Size inputGeometry = sizes_[i];
            net_.setInput(blobFromImage(inputImage, 1, inputGeometry, Scalar(123, 117, 104), false, false), "data");
            Mat outputNet = net_.forward();
            int nbrTextBoxes = outputNet.size[2];
            int nCol = outputNet.size[3];
            int outputChannelCount = outputNet.size[1];
            CV_Assert(outputChannelCount == 1);
            getOutputs((float*)(outputNet.data), nbrTextBoxes, nCol, Bbox, confidence, inputImage.size());
        }
     }
 };
 // Factory: builds the dnn-based implementation for the given model files and detection sizes
 // and hands it out through the public interface type.
 Ptr<TextDetectorCNN> TextDetectorCNN::create(const String &modelArchFilename, const String &modelWeightsFilename, std::vector<Size> detectionSizes)
 {
    Ptr<TextDetectorCNN> detector =
            makePtr<TextDetectorCNNImpl>(modelArchFilename, modelWeightsFilename, detectionSizes);
    return detector;
 }
 // Factory overload without explicit sizes: forwards to the three-argument overload with a
 // single 300x300 detection size.
 Ptr<TextDetectorCNN> TextDetectorCNN::create(const String &modelArchFilename, const String &modelWeightsFilename)
 {
    const std::vector<Size> defaultSizes(1, Size(300, 300));
    return create(modelArchFilename, modelWeightsFilename, defaultSizes);
 }
 } //namespace text
 } //namespace cv