Merge pull request #1399 from sovrasov:text_detector_dnn
commit
6651fb0b45
13 changed files with 2063 additions and 10 deletions
@ -0,0 +1,73 @@ |
|||||||
|
// This file is part of OpenCV project.
|
||||||
|
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||||
|
// of this distribution and at http://opencv.org/license.html.
|
||||||
|
|
||||||
|
#ifndef __OPENCV_TEXT_TEXTDETECTOR_HPP__ |
||||||
|
#define __OPENCV_TEXT_TEXTDETECTOR_HPP__ |
||||||
|
|
||||||
|
#include"ocr.hpp" |
||||||
|
|
||||||
|
namespace cv |
||||||
|
{ |
||||||
|
namespace text |
||||||
|
{ |
||||||
|
|
||||||
|
//! @addtogroup text_detect
|
||||||
|
//! @{
|
||||||
|
|
||||||
|
/** @brief An abstract class providing interface for text detection algorithms
|
||||||
|
*/ |
||||||
|
class CV_EXPORTS_W TextDetector |
||||||
|
{ |
||||||
|
public: |
||||||
|
/**
|
||||||
|
@brief Method that provides a quick and simple interface to detect text inside an image |
||||||
|
|
||||||
|
@param inputImage an image to process |
||||||
|
@param Bbox a vector of Rect that will store the detected word bounding box |
||||||
|
@param confidence a vector of float that will be updated with the confidence the classifier has for the selected bounding box |
||||||
|
*/ |
||||||
|
CV_WRAP virtual void detect(InputArray inputImage, CV_OUT std::vector<Rect>& Bbox, CV_OUT std::vector<float>& confidence) = 0; |
||||||
|
virtual ~TextDetector() {} |
||||||
|
}; |
||||||
|
|
||||||
|
/** @brief TextDetectorCNN class provides the functionallity of text bounding box detection.
|
||||||
|
This class is representing to find bounding boxes of text words given an input image. |
||||||
|
This class uses OpenCV dnn module to load pre-trained model described in @cite LiaoSBWL17. |
||||||
|
The original repository with the modified SSD Caffe version: https://github.com/MhLiao/TextBoxes.
|
||||||
|
Model can be downloaded from [DropBox](https://www.dropbox.com/s/g8pjzv2de9gty8g/TextBoxes_icdar13.caffemodel?dl=0).
|
||||||
|
Modified .prototxt file with the model description can be found in `opencv_contrib/modules/text/samples/textbox.prototxt`. |
||||||
|
*/ |
||||||
|
class CV_EXPORTS_W TextDetectorCNN : public TextDetector |
||||||
|
{ |
||||||
|
public: |
||||||
|
/**
|
||||||
|
@overload |
||||||
|
|
||||||
|
@param inputImage an image expected to be a CV_U8C3 of any size |
||||||
|
@param Bbox a vector of Rect that will store the detected word bounding box |
||||||
|
@param confidence a vector of float that will be updated with the confidence the classifier has for the selected bounding box |
||||||
|
*/ |
||||||
|
CV_WRAP virtual void detect(InputArray inputImage, CV_OUT std::vector<Rect>& Bbox, CV_OUT std::vector<float>& confidence) = 0; |
||||||
|
|
||||||
|
/** @brief Creates an instance of the TextDetectorCNN class using the provided parameters.
|
||||||
|
|
||||||
|
@param modelArchFilename the relative or absolute path to the prototxt file describing the classifiers architecture. |
||||||
|
@param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form. |
||||||
|
@param detectionSizes a list of sizes for multiscale detection. The values`[(300,300),(700,500),(700,300),(700,700),(1600,1600)]` are |
||||||
|
recommended in @cite LiaoSBWL17 to achieve the best quality. |
||||||
|
*/ |
||||||
|
static Ptr<TextDetectorCNN> create(const String& modelArchFilename, const String& modelWeightsFilename, |
||||||
|
std::vector<Size> detectionSizes); |
||||||
|
/**
|
||||||
|
@overload |
||||||
|
*/ |
||||||
|
CV_WRAP static Ptr<TextDetectorCNN> create(const String& modelArchFilename, const String& modelWeightsFilename); |
||||||
|
}; |
||||||
|
|
||||||
|
//! @}
|
||||||
|
}//namespace text
|
||||||
|
}//namespace cv
|
||||||
|
|
||||||
|
|
||||||
|
#endif // __OPENCV_TEXT_TEXTDETECTOR_HPP__
|
@ -0,0 +1,37 @@ |
|||||||
|
#!/usr/bin/python
# -*- coding: utf-8 -*-
||||||
|
import sys |
||||||
|
import os |
||||||
|
import cv2 |
||||||
|
import numpy as np |
||||||
|
|
||||||
|
def main():
    """Detect text boxes in an image with the TextBoxes CNN and display them.

    The image path is taken from the first command-line argument. The model
    files 'textbox.prototxt' and 'TextBoxes_icdar13.caffemodel' must be in the
    current working directory. Detections whose confidence exceeds 0.6 are
    drawn on a copy of the image, which is shown until a key is pressed.
    Exits with status 1 when the argument, the model files or the image are
    missing.
    """
    print('\nDeeptextdetection.py')
    print(' A demo script of text box alogorithm of the paper:')
    print(' * Minghui Liao et al.: TextBoxes: A Fast Text Detector with a Single Deep Neural Network https://arxiv.org/abs/1611.06779\n')

    if len(sys.argv) < 2:
        print(' (ERROR) You must call this script with an argument (path_to_image_to_be_processed)\n')
        sys.exit(1)

    if not os.path.isfile('TextBoxes_icdar13.caffemodel') or not os.path.isfile('textbox.prototxt'):
        # print() calls (not Python 2 print statements) keep the script valid on Python 3 as well.
        print(" Model files not found in current directory. Aborting")
        print(" See the documentation of text::TextDetectorCNN class to get download links.")
        sys.exit(1)

    img = cv2.imread(str(sys.argv[1]))
    if img is None:
        # cv2.imread signals an unreadable or missing file by returning None.
        print(' (ERROR) Could not read image: ' + str(sys.argv[1]))
        sys.exit(1)

    textSpotter = cv2.text.TextDetectorCNN_create("textbox.prototxt", "TextBoxes_icdar13.caffemodel")
    rects, outProbs = textSpotter.detect(img)
    vis = img.copy()
    thres = 0.6

    # Draw only the boxes the detector is reasonably confident about.
    for rect, prob in zip(rects, outProbs):
        if prob > thres:
            cv2.rectangle(vis, (rect[0], rect[1]), (rect[0] + rect[2], rect[1] + rect[3]), (255, 0, 0), 2)

    cv2.imshow("Text detection result", vis)
    cv2.waitKey()
||||||
|
|
||||||
|
# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
@ -0,0 +1,122 @@ |
|||||||
|
#include <opencv2/text.hpp> |
||||||
|
#include <opencv2/highgui.hpp> |
||||||
|
#include <opencv2/imgproc.hpp> |
||||||
|
#include <opencv2/dnn.hpp> |
||||||
|
|
||||||
|
#include <iostream> |
||||||
|
#include <fstream> |
||||||
|
|
||||||
|
using namespace cv; |
||||||
|
using namespace std; |
||||||
|
|
||||||
|
namespace |
||||||
|
{ |
||||||
|
/**
 * Prints the demo description, the usage line and the model download hints to standard output.
 *
 * @param progFname program name inserted into the usage line (callers pass argv[0]).
 */
void printHelpStr(const std::string& progFname)
{
    // The program only consumes argv[1] (the input image), so the usage line lists a single argument.
    std::cout << " Demo of text recognition CNN for text detection." << std::endl
              << " Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015" << std::endl << std::endl
              << " Usage: " << progFname << " <input_image>" << std::endl
              << " Caffe Model files (textbox.prototxt, TextBoxes_icdar13.caffemodel)" << std::endl
              << " must be in the current directory. See the documentation of text::TextDetectorCNN class to get download links." << std::endl
              << " Obtaining text recognition Caffe Model files in linux shell:" << std::endl
              << " wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg.caffemodel" << std::endl
              << " wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_deploy.prototxt" << std::endl
              << " wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_labels.txt" << std::endl << std::endl;
}
||||||
|
|
||||||
|
/**
 * Reports whether an input file stream opened on `filename` is in a good state.
 *
 * @param filename path of the file to probe.
 * @return true when the stream opened successfully, false otherwise.
 */
bool fileExists(const std::string& filename)
{
    std::ifstream f(filename.c_str());
    return f.good();
}
||||||
|
|
||||||
|
/**
 * Draws the selected detections onto `src` and, for colour images, prints each of them to standard output.
 *
 * @param src     image to draw on; drawing is performed on this Mat.
 * @param groups  all detected boxes.
 * @param probs   confidence of each box in `groups`.
 * @param indexes positions in `groups` / `probs` of the boxes to draw.
 */
void textbox_draw(Mat src, std::vector<Rect>& groups, std::vector<float>& probs, std::vector<int>& indexes)
{
    // The image type cannot change while drawing, so test it once.
    const bool isColor = (src.type() == CV_8UC3);

    for (size_t i = 0; i < indexes.size(); i++)
    {
        // `i` walks over `indexes`; the box itself lives at groups[indexes[i]].
        const int boxIdx = indexes[i];
        const Rect currentBox = groups[boxIdx];

        if (!isColor)
        {
            rectangle(src, currentBox, Scalar( 255 ), 3, 8 );
            continue;
        }

        rectangle(src, currentBox, Scalar( 0, 255, 255 ), 2, LINE_AA);
        String label = format("%.2f", probs[boxIdx]);
        std::cout << "text box: " << currentBox << " confidence: " << probs[boxIdx] << "\n";

        // Put the label on a filled white background, kept inside the top image border.
        int baseLine = 0;
        Size labelSize = getTextSize(label, FONT_HERSHEY_PLAIN, 1, 1, &baseLine);
        int yLeftBottom = std::max(currentBox.y, labelSize.height);
        rectangle(src, Point(currentBox.x, yLeftBottom - labelSize.height),
                  Point(currentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED);

        putText(src, label, Point(currentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA);
    }
}
||||||
|
|
||||||
|
} |
||||||
|
|
||||||
|
int main(int argc, const char * argv[]) |
||||||
|
{ |
||||||
|
if (argc < 2) |
||||||
|
{ |
||||||
|
printHelpStr(argv[0]); |
||||||
|
cout << "Insufiecient parameters. Aborting!" << endl; |
||||||
|
exit(1); |
||||||
|
} |
||||||
|
|
||||||
|
const string modelArch = "textbox.prototxt"; |
||||||
|
const string moddelWeights = "TextBoxes_icdar13.caffemodel"; |
||||||
|
|
||||||
|
if (!fileExists(modelArch) || !fileExists(moddelWeights)) |
||||||
|
{ |
||||||
|
printHelpStr(argv[0]); |
||||||
|
cout << "Model files not found in the current directory. Aborting!" << endl; |
||||||
|
exit(1); |
||||||
|
} |
||||||
|
|
||||||
|
Mat image = imread(String(argv[1]), IMREAD_COLOR); |
||||||
|
|
||||||
|
cout << "Starting Text Box Demo" << endl; |
||||||
|
Ptr<text::TextDetectorCNN> textSpotter = |
||||||
|
text::TextDetectorCNN::create(modelArch, moddelWeights); |
||||||
|
|
||||||
|
vector<Rect> bbox; |
||||||
|
vector<float> outProbabillities; |
||||||
|
textSpotter->detect(image, bbox, outProbabillities); |
||||||
|
std::vector<int> indexes; |
||||||
|
cv::dnn::NMSBoxes(bbox, outProbabillities, 0.4f, 0.5f, indexes); |
||||||
|
|
||||||
|
Mat image_copy = image.clone(); |
||||||
|
textbox_draw(image_copy, bbox, outProbabillities, indexes); |
||||||
|
imshow("Text detection", image_copy); |
||||||
|
image_copy = image.clone(); |
||||||
|
|
||||||
|
Ptr<text::OCRHolisticWordRecognizer> wordSpotter = |
||||||
|
text::OCRHolisticWordRecognizer::create("dictnet_vgg_deploy.prototxt", "dictnet_vgg.caffemodel", "dictnet_vgg_labels.txt"); |
||||||
|
|
||||||
|
for(size_t i = 0; i < indexes.size(); i++) |
||||||
|
{ |
||||||
|
Mat wordImg; |
||||||
|
cvtColor(image(bbox[indexes[i]]), wordImg, COLOR_BGR2GRAY); |
||||||
|
string word; |
||||||
|
vector<float> confs; |
||||||
|
wordSpotter->run(wordImg, word, NULL, NULL, &confs); |
||||||
|
|
||||||
|
Rect currrentBox = bbox[indexes[i]]; |
||||||
|
rectangle(image_copy, currrentBox, Scalar( 0, 255, 255 ), 2, LINE_AA); |
||||||
|
|
||||||
|
int baseLine = 0; |
||||||
|
Size labelSize = getTextSize(word, FONT_HERSHEY_PLAIN, 1, 1, &baseLine); |
||||||
|
int yLeftBottom = std::max(currrentBox.y, labelSize.height); |
||||||
|
rectangle(image_copy, Point(currrentBox.x, yLeftBottom - labelSize.height), |
||||||
|
Point(currrentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED); |
||||||
|
|
||||||
|
putText(image_copy, word, Point(currrentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA); |
||||||
|
|
||||||
|
} |
||||||
|
imshow("Text recognition", image_copy); |
||||||
|
cout << "Recognition finished. Press any key to exit.\n"; |
||||||
|
waitKey(); |
||||||
|
return 0; |
||||||
|
} |
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,96 @@ |
|||||||
|
#include <opencv2/text.hpp> |
||||||
|
#include <opencv2/highgui.hpp> |
||||||
|
#include <opencv2/imgproc.hpp> |
||||||
|
#include <opencv2/dnn.hpp> |
||||||
|
|
||||||
|
#include <sstream> |
||||||
|
#include <iostream> |
||||||
|
#include <fstream> |
||||||
|
|
||||||
|
using namespace cv; |
||||||
|
|
||||||
|
namespace |
||||||
|
{ |
||||||
|
/**
 * Builds the help text of the text detection demo.
 *
 * @param progFname program name inserted into the usage line (callers pass argv[0]).
 * @return the description, usage line and model file requirements as one string.
 */
std::string getHelpStr(const std::string& progFname)
{
    std::stringstream out;
    // The program only consumes argv[1] (the input image), so the usage line lists a single argument.
    out << " Demo of text detection CNN for text detection." << std::endl
        << " Minghui Liao, Baoguang Shi, Xiang Bai, Xinggang Wang, Wenyu Liu: TextBoxes: A Fast Text Detector with a Single Deep Neural Network, AAAI2017\n\n"
        << " Usage: " << progFname << " <input_image>" << std::endl
        << " Caffe Model files (textbox.prototxt, TextBoxes_icdar13.caffemodel)" << std::endl
        << " must be in the current directory. See the documentation of text::TextDetectorCNN class to get download links." << std::endl;
    return out.str();
}
||||||
|
|
||||||
|
/**
 * Reports whether an input file stream opened on `filename` is in a good state.
 *
 * @param filename path of the file to probe.
 * @return true when the stream opened successfully, false otherwise.
 */
bool fileExists(const std::string& filename)
{
    std::ifstream f(filename.c_str());
    return f.good();
}
||||||
|
|
||||||
|
/**
 * Draws the selected detections onto `src` and, for colour images, prints each of them to standard output.
 *
 * @param src     image to draw on; drawing is performed on this Mat.
 * @param groups  all detected boxes.
 * @param probs   confidence of each box in `groups`.
 * @param indexes positions in `groups` / `probs` of the boxes to draw.
 */
void textbox_draw(Mat src, std::vector<Rect>& groups, std::vector<float>& probs, std::vector<int>& indexes)
{
    // The image type cannot change while drawing, so test it once.
    const bool isColor = (src.type() == CV_8UC3);

    for (size_t i = 0; i < indexes.size(); i++)
    {
        // `i` walks over `indexes`; the box itself lives at groups[indexes[i]].
        const int boxIdx = indexes[i];
        const Rect currentBox = groups[boxIdx];

        if (!isColor)
        {
            rectangle(src, currentBox, Scalar( 255 ), 3, 8 );
            continue;
        }

        rectangle(src, currentBox, Scalar( 0, 255, 255 ), 2, LINE_AA);
        String label = format("%.2f", probs[boxIdx]);
        std::cout << "text box: " << currentBox << " confidence: " << probs[boxIdx] << "\n";

        // Put the label on a filled white background, kept inside the top image border.
        int baseLine = 0;
        Size labelSize = getTextSize(label, FONT_HERSHEY_PLAIN, 1, 1, &baseLine);
        int yLeftBottom = std::max(currentBox.y, labelSize.height);
        rectangle(src, Point(currentBox.x, yLeftBottom - labelSize.height),
                  Point(currentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED);

        putText(src, label, Point(currentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA);
    }
}
||||||
|
|
||||||
|
} |
||||||
|
|
||||||
|
int main(int argc, const char * argv[]) |
||||||
|
{ |
||||||
|
if (argc < 2) |
||||||
|
{ |
||||||
|
std::cout << getHelpStr(argv[0]); |
||||||
|
std::cout << "Insufiecient parameters. Aborting!" << std::endl; |
||||||
|
exit(1); |
||||||
|
} |
||||||
|
|
||||||
|
const std::string modelArch = "textbox.prototxt"; |
||||||
|
const std::string moddelWeights = "TextBoxes_icdar13.caffemodel"; |
||||||
|
|
||||||
|
if (!fileExists(modelArch) || !fileExists(moddelWeights)) |
||||||
|
{ |
||||||
|
std::cout << getHelpStr(argv[0]); |
||||||
|
std::cout << "Model files not found in the current directory. Aborting!" << std::endl; |
||||||
|
exit(1); |
||||||
|
} |
||||||
|
|
||||||
|
Mat image = imread(String(argv[1]), IMREAD_COLOR); |
||||||
|
|
||||||
|
std::cout << "Starting Text Box Demo" << std::endl; |
||||||
|
Ptr<text::TextDetectorCNN> textSpotter = |
||||||
|
text::TextDetectorCNN::create(modelArch, moddelWeights); |
||||||
|
|
||||||
|
std::vector<Rect> bbox; |
||||||
|
std::vector<float> outProbabillities; |
||||||
|
textSpotter->detect(image, bbox, outProbabillities); |
||||||
|
|
||||||
|
std::vector<int> indexes; |
||||||
|
cv::dnn::NMSBoxes(bbox, outProbabillities, 0.3f, 0.4f, indexes); |
||||||
|
|
||||||
|
textbox_draw(image, bbox, outProbabillities, indexes); |
||||||
|
|
||||||
|
imshow("TextBox Demo",image); |
||||||
|
std::cout << "Done!" << std::endl << std::endl; |
||||||
|
std::cout << "Press any key to exit." << std::endl << std::endl; |
||||||
|
waitKey(); |
||||||
|
return 0; |
||||||
|
} |
@ -0,0 +1,94 @@ |
|||||||
|
// This file is part of OpenCV project.
|
||||||
|
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||||
|
// of this distribution and at http://opencv.org/license.html.
|
||||||
|
|
||||||
|
#include "precomp.hpp" |
||||||
|
#include "opencv2/imgproc.hpp" |
||||||
|
#include "opencv2/core.hpp" |
||||||
|
#include "opencv2/dnn.hpp" |
||||||
|
|
||||||
|
#include <fstream> |
||||||
|
#include <algorithm> |
||||||
|
|
||||||
|
using namespace cv::dnn; |
||||||
|
|
||||||
|
namespace cv |
||||||
|
{ |
||||||
|
namespace text |
||||||
|
{ |
||||||
|
|
||||||
|
class TextDetectorCNNImpl : public TextDetectorCNN |
||||||
|
{ |
||||||
|
protected: |
||||||
|
Net net_; |
||||||
|
std::vector<Size> sizes_; |
||||||
|
int inputChannelCount_; |
||||||
|
|
||||||
|
void getOutputs(const float* buffer,int nbrTextBoxes,int nCol, |
||||||
|
std::vector<Rect>& Bbox, std::vector<float>& confidence, Size inputShape) |
||||||
|
{ |
||||||
|
for(int k = 0; k < nbrTextBoxes; k++) |
||||||
|
{ |
||||||
|
float x_min = buffer[k*nCol + 3]*inputShape.width; |
||||||
|
float y_min = buffer[k*nCol + 4]*inputShape.height; |
||||||
|
|
||||||
|
float x_max = buffer[k*nCol + 5]*inputShape.width; |
||||||
|
float y_max = buffer[k*nCol + 6]*inputShape.height; |
||||||
|
|
||||||
|
CV_Assert(x_min < x_max, y_min < y_max); |
||||||
|
|
||||||
|
x_min = std::max(0.f, x_min); |
||||||
|
y_min = std::max(0.f, y_min); |
||||||
|
|
||||||
|
x_max = std::min(inputShape.width - 1.f, x_max); |
||||||
|
y_max = std::min(inputShape.height - 1.f, y_max); |
||||||
|
|
||||||
|
int wd = cvRound(x_max - x_min); |
||||||
|
int ht = cvRound(y_max - y_min); |
||||||
|
|
||||||
|
Bbox.push_back(Rect(cvRound(x_min), cvRound(y_min), wd, ht)); |
||||||
|
confidence.push_back(buffer[k*nCol + 2]); |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
public: |
||||||
|
TextDetectorCNNImpl(const String& modelArchFilename, const String& modelWeightsFilename, std::vector<Size> detectionSizes) : |
||||||
|
sizes_(detectionSizes) |
||||||
|
{ |
||||||
|
net_ = readNetFromCaffe(modelArchFilename, modelWeightsFilename); |
||||||
|
CV_Assert(!net_.empty()); |
||||||
|
inputChannelCount_ = 3; |
||||||
|
} |
||||||
|
|
||||||
|
void detect(InputArray inputImage_, std::vector<Rect>& Bbox, std::vector<float>& confidence) |
||||||
|
{ |
||||||
|
CV_Assert(inputImage_.channels() == inputChannelCount_); |
||||||
|
Mat inputImage = inputImage_.getMat(); |
||||||
|
Bbox.resize(0); |
||||||
|
confidence.resize(0); |
||||||
|
|
||||||
|
for(size_t i = 0; i < sizes_.size(); i++) |
||||||
|
{ |
||||||
|
Size inputGeometry = sizes_[i]; |
||||||
|
net_.setInput(blobFromImage(inputImage, 1, inputGeometry, Scalar(123, 117, 104), false, false), "data"); |
||||||
|
Mat outputNet = net_.forward(); |
||||||
|
int nbrTextBoxes = outputNet.size[2]; |
||||||
|
int nCol = outputNet.size[3]; |
||||||
|
int outputChannelCount = outputNet.size[1]; |
||||||
|
CV_Assert(outputChannelCount == 1); |
||||||
|
getOutputs((float*)(outputNet.data), nbrTextBoxes, nCol, Bbox, confidence, inputImage.size()); |
||||||
|
} |
||||||
|
} |
||||||
|
}; |
||||||
|
|
||||||
|
/**
 * Creates a detector that runs the given Caffe model at each of the requested input sizes.
 *
 * @param modelArchFilename    path to the Caffe prototxt file.
 * @param modelWeightsFilename path to the Caffe weights file.
 * @param detectionSizes       network input sizes used for detection.
 * @return the newly constructed TextDetectorCNNImpl.
 */
Ptr<TextDetectorCNN> TextDetectorCNN::create(const String &modelArchFilename, const String &modelWeightsFilename, std::vector<Size> detectionSizes)
{
    return makePtr<TextDetectorCNNImpl>(modelArchFilename, modelWeightsFilename, detectionSizes);
}
||||||
|
|
||||||
|
/**
 * Creates a detector that runs the given Caffe model at a single input size of 300x300.
 *
 * @param modelArchFilename    path to the Caffe prototxt file.
 * @param modelWeightsFilename path to the Caffe weights file.
 * @return the detector produced by the three-argument overload.
 */
Ptr<TextDetectorCNN> TextDetectorCNN::create(const String &modelArchFilename, const String &modelWeightsFilename)
{
    return create(modelArchFilename, modelWeightsFilename, std::vector<Size>(1, Size(300, 300)));
}
||||||
|
} //namespace text
|
||||||
|
} //namespace cv
|
Loading…
Reference in new issue