Merge pull request #1399 from sovrasov:text_detector_dnn
commit
6651fb0b45
13 changed files with 2063 additions and 10 deletions
@ -0,0 +1,73 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef __OPENCV_TEXT_TEXTDETECTOR_HPP__ |
||||
#define __OPENCV_TEXT_TEXTDETECTOR_HPP__ |
||||
|
||||
#include"ocr.hpp" |
||||
|
||||
namespace cv |
||||
{ |
||||
namespace text |
||||
{ |
||||
|
||||
//! @addtogroup text_detect
|
||||
//! @{
|
||||
|
||||
/** @brief An abstract class providing interface for text detection algorithms
|
||||
*/ |
||||
class CV_EXPORTS_W TextDetector |
||||
{ |
||||
public: |
||||
/**
|
||||
@brief Method that provides a quick and simple interface to detect text inside an image |
||||
|
||||
@param inputImage an image to process |
||||
@param Bbox a vector of Rect that will store the detected word bounding box |
||||
@param confidence a vector of float that will be updated with the confidence the classifier has for the selected bounding box |
||||
*/ |
||||
CV_WRAP virtual void detect(InputArray inputImage, CV_OUT std::vector<Rect>& Bbox, CV_OUT std::vector<float>& confidence) = 0; |
||||
virtual ~TextDetector() {} |
||||
}; |
||||
|
||||
/** @brief TextDetectorCNN class provides the functionallity of text bounding box detection.
|
||||
This class is representing to find bounding boxes of text words given an input image. |
||||
This class uses OpenCV dnn module to load pre-trained model described in @cite LiaoSBWL17. |
||||
The original repository with the modified SSD Caffe version: https://github.com/MhLiao/TextBoxes.
|
||||
Model can be downloaded from [DropBox](https://www.dropbox.com/s/g8pjzv2de9gty8g/TextBoxes_icdar13.caffemodel?dl=0).
|
||||
Modified .prototxt file with the model description can be found in `opencv_contrib/modules/text/samples/textbox.prototxt`. |
||||
*/ |
||||
class CV_EXPORTS_W TextDetectorCNN : public TextDetector |
||||
{ |
||||
public: |
||||
/**
|
||||
@overload |
||||
|
||||
@param inputImage an image expected to be a CV_U8C3 of any size |
||||
@param Bbox a vector of Rect that will store the detected word bounding box |
||||
@param confidence a vector of float that will be updated with the confidence the classifier has for the selected bounding box |
||||
*/ |
||||
CV_WRAP virtual void detect(InputArray inputImage, CV_OUT std::vector<Rect>& Bbox, CV_OUT std::vector<float>& confidence) = 0; |
||||
|
||||
/** @brief Creates an instance of the TextDetectorCNN class using the provided parameters.
|
||||
|
||||
@param modelArchFilename the relative or absolute path to the prototxt file describing the classifiers architecture. |
||||
@param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form. |
||||
@param detectionSizes a list of sizes for multiscale detection. The values`[(300,300),(700,500),(700,300),(700,700),(1600,1600)]` are |
||||
recommended in @cite LiaoSBWL17 to achieve the best quality. |
||||
*/ |
||||
static Ptr<TextDetectorCNN> create(const String& modelArchFilename, const String& modelWeightsFilename, |
||||
std::vector<Size> detectionSizes); |
||||
/**
|
||||
@overload |
||||
*/ |
||||
CV_WRAP static Ptr<TextDetectorCNN> create(const String& modelArchFilename, const String& modelWeightsFilename); |
||||
}; |
||||
|
||||
//! @}
|
||||
}//namespace text
|
||||
}//namespace cv
|
||||
|
||||
|
||||
#endif // __OPENCV_TEXT_TEXTDETECTOR_HPP__
|
@ -0,0 +1,37 @@ |
||||
#!/usr/bin/python
# -*- coding: utf-8 -*-
||||
import sys |
||||
import os |
||||
import cv2 |
||||
import numpy as np |
||||
|
||||
def main():
    """Run the TextBoxes text detection demo on the image given as sys.argv[1].

    The model files 'textbox.prototxt' and 'TextBoxes_icdar13.caffemodel' must be
    present in the current working directory. Every detection whose confidence is
    above a fixed threshold is drawn on a copy of the image, which is then shown
    in a window until a key is pressed.

    Exits with status 1 when the image argument is missing, the model files are
    not found, or the image cannot be read.
    """
    print('\nDeeptextdetection.py')
    print(' A demo script of text box alogorithm of the paper:')
    print(' * Minghui Liao et al.: TextBoxes: A Fast Text Detector with a Single Deep Neural Network https://arxiv.org/abs/1611.06779\n')

    if len(sys.argv) < 2:
        print(' (ERROR) You must call this script with an argument (path_to_image_to_be_processed)\n')
        sys.exit(1)

    if not os.path.isfile('TextBoxes_icdar13.caffemodel') or not os.path.isfile('textbox.prototxt'):
        # print() calls (not Python 2 print statements) keep the script valid on Python 3.
        print(" Model files not found in current directory. Aborting")
        print(" See the documentation of text::TextDetectorCNN class to get download links.")
        sys.exit(1)

    image_path = str(sys.argv[1])
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        print(' (ERROR) Could not read the image: ' + image_path + '\n')
        sys.exit(1)

    textSpotter = cv2.text.TextDetectorCNN_create("textbox.prototxt", "TextBoxes_icdar13.caffemodel")
    rects, outProbs = textSpotter.detect(img)
    vis = img.copy()
    thres = 0.6

    for r in range(np.shape(rects)[0]):
        if outProbs[r] > thres:
            rect = rects[r]
            # rect is laid out as (x, y, width, height).
            top_left = (rect[0], rect[1])
            bottom_right = (rect[0] + rect[2], rect[1] + rect[3])
            cv2.rectangle(vis, top_left, bottom_right, (255, 0, 0), 2)

    cv2.imshow("Text detection result", vis)
    cv2.waitKey()


if __name__ == "__main__":
    main()
@ -0,0 +1,122 @@ |
||||
#include <opencv2/text.hpp> |
||||
#include <opencv2/highgui.hpp> |
||||
#include <opencv2/imgproc.hpp> |
||||
#include <opencv2/dnn.hpp> |
||||
|
||||
#include <iostream> |
||||
#include <fstream> |
||||
|
||||
using namespace cv; |
||||
using namespace std; |
||||
|
||||
namespace |
||||
{ |
||||
/**
 * Prints the demo description, usage line and model download hints to std::cout.
 *
 * The program reads exactly one positional argument (the input image), so the
 * usage line advertises only <input_image>.
 *
 * @param progFname program name shown in the usage line (normally argv[0]).
 */
void printHelpStr(const std::string& progFname)
{
    std::cout << " Demo of text recognition CNN for text detection." << std::endl
              << " Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015" << std::endl << std::endl
              << " Usage: " << progFname << " <input_image>" << std::endl
              << " Caffe Model files (textbox.prototxt, TextBoxes_icdar13.caffemodel)" << std::endl
              << " must be in the current directory. See the documentation of text::TextDetectorCNN class to get download links." << std::endl
              << " Obtaining text recognition Caffe Model files in linux shell:" << std::endl
              << " wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg.caffemodel" << std::endl
              << " wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_deploy.prototxt" << std::endl
              << " wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_labels.txt" << std::endl << std::endl;
}
||||
|
||||
/**
 * Reports whether `filename` can be opened for reading.
 *
 * @param filename path of the file to probe.
 * @return true when the input stream is in a good state right after opening.
 */
bool fileExists (const std::string& filename)
{
    std::ifstream probe;
    probe.open(filename.c_str());
    const bool readable = probe.good();
    return readable;
}
||||
|
||||
/**
 * Draws the selected detections on `src`.
 *
 * For CV_8UC3 images each box is outlined, labelled with its confidence and
 * logged to std::cout. For any other image type only a plain outline is drawn.
 *
 * @param src     image to draw on; Mat headers share pixel data, so the caller's image is modified.
 * @param groups  all candidate bounding boxes.
 * @param probs   confidence of each candidate, parallel to `groups`.
 * @param indexes positions inside `groups`/`probs` of the boxes to draw.
 */
void textbox_draw(Mat src, std::vector<Rect>& groups, std::vector<float>& probs, std::vector<int>& indexes)
{
    // Drawing never changes the image type, so the rendering mode is decided once.
    const bool isColorImage = (src.type() == CV_8UC3);

    for (size_t i = 0; i < indexes.size(); i++)
    {
        // `i` counts the selected boxes; the box itself lives at groups[indexes[i]].
        const int boxIdx = indexes[i];
        const Rect currentBox = groups[boxIdx];

        if (!isColorImage)
        {
            rectangle(src, currentBox, Scalar( 255 ), 3, 8 );
            continue;
        }

        rectangle(src, currentBox, Scalar( 0, 255, 255 ), 2, LINE_AA);
        String label = format("%.2f", probs[boxIdx]);
        std::cout << "text box: " << currentBox << " confidence: " << probs[boxIdx] << "\n";

        int baseLine = 0;
        Size labelSize = getTextSize(label, FONT_HERSHEY_PLAIN, 1, 1, &baseLine);
        // Keep the label inside the image when the box touches the top border.
        int yLeftBottom = std::max(currentBox.y, labelSize.height);
        rectangle(src, Point(currentBox.x, yLeftBottom - labelSize.height),
                  Point(currentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED);

        putText(src, label, Point(currentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA);
    }
}
||||
|
||||
} |
||||
|
||||
int main(int argc, const char * argv[]) |
||||
{ |
||||
if (argc < 2) |
||||
{ |
||||
printHelpStr(argv[0]); |
||||
cout << "Insufiecient parameters. Aborting!" << endl; |
||||
exit(1); |
||||
} |
||||
|
||||
const string modelArch = "textbox.prototxt"; |
||||
const string moddelWeights = "TextBoxes_icdar13.caffemodel"; |
||||
|
||||
if (!fileExists(modelArch) || !fileExists(moddelWeights)) |
||||
{ |
||||
printHelpStr(argv[0]); |
||||
cout << "Model files not found in the current directory. Aborting!" << endl; |
||||
exit(1); |
||||
} |
||||
|
||||
Mat image = imread(String(argv[1]), IMREAD_COLOR); |
||||
|
||||
cout << "Starting Text Box Demo" << endl; |
||||
Ptr<text::TextDetectorCNN> textSpotter = |
||||
text::TextDetectorCNN::create(modelArch, moddelWeights); |
||||
|
||||
vector<Rect> bbox; |
||||
vector<float> outProbabillities; |
||||
textSpotter->detect(image, bbox, outProbabillities); |
||||
std::vector<int> indexes; |
||||
cv::dnn::NMSBoxes(bbox, outProbabillities, 0.4f, 0.5f, indexes); |
||||
|
||||
Mat image_copy = image.clone(); |
||||
textbox_draw(image_copy, bbox, outProbabillities, indexes); |
||||
imshow("Text detection", image_copy); |
||||
image_copy = image.clone(); |
||||
|
||||
Ptr<text::OCRHolisticWordRecognizer> wordSpotter = |
||||
text::OCRHolisticWordRecognizer::create("dictnet_vgg_deploy.prototxt", "dictnet_vgg.caffemodel", "dictnet_vgg_labels.txt"); |
||||
|
||||
for(size_t i = 0; i < indexes.size(); i++) |
||||
{ |
||||
Mat wordImg; |
||||
cvtColor(image(bbox[indexes[i]]), wordImg, COLOR_BGR2GRAY); |
||||
string word; |
||||
vector<float> confs; |
||||
wordSpotter->run(wordImg, word, NULL, NULL, &confs); |
||||
|
||||
Rect currrentBox = bbox[indexes[i]]; |
||||
rectangle(image_copy, currrentBox, Scalar( 0, 255, 255 ), 2, LINE_AA); |
||||
|
||||
int baseLine = 0; |
||||
Size labelSize = getTextSize(word, FONT_HERSHEY_PLAIN, 1, 1, &baseLine); |
||||
int yLeftBottom = std::max(currrentBox.y, labelSize.height); |
||||
rectangle(image_copy, Point(currrentBox.x, yLeftBottom - labelSize.height), |
||||
Point(currrentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED); |
||||
|
||||
putText(image_copy, word, Point(currrentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA); |
||||
|
||||
} |
||||
imshow("Text recognition", image_copy); |
||||
cout << "Recognition finished. Press any key to exit.\n"; |
||||
waitKey(); |
||||
return 0; |
||||
} |
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,96 @@ |
||||
#include <opencv2/text.hpp> |
||||
#include <opencv2/highgui.hpp> |
||||
#include <opencv2/imgproc.hpp> |
||||
#include <opencv2/dnn.hpp> |
||||
|
||||
#include <sstream> |
||||
#include <iostream> |
||||
#include <fstream> |
||||
|
||||
using namespace cv; |
||||
|
||||
namespace |
||||
{ |
||||
/**
 * Builds the help text of the demo: description, usage line and model file hints.
 *
 * The program reads exactly one positional argument (the input image), so the
 * usage line advertises only <input_image>.
 *
 * @param progFname program name shown in the usage line (normally argv[0]).
 * @return the complete help text, terminated by a newline.
 */
std::string getHelpStr(const std::string& progFname)
{
    std::stringstream out;
    out << " Demo of text detection CNN for text detection." << '\n'
        << " Minghui Liao, Baoguang Shi, Xiang Bai, Xinggang Wang, Wenyu Liu: TextBoxes: A Fast Text Detector with a Single Deep Neural Network, AAAI2017\n\n"
        << " Usage: " << progFname << " <input_image>" << '\n'
        << " Caffe Model files (textbox.prototxt, TextBoxes_icdar13.caffemodel)" << '\n'
        << " must be in the current directory. See the documentation of text::TextDetectorCNN class to get download links." << '\n';
    return out.str();
}
||||
|
||||
/**
 * Reports whether `filename` can be opened for reading.
 *
 * @param filename path of the file to probe.
 * @return true when the input stream is in a good state right after opening.
 */
bool fileExists (const std::string& filename)
{
    std::ifstream probe;
    probe.open(filename.c_str());
    const bool readable = probe.good();
    return readable;
}
||||
|
||||
/**
 * Draws the selected detections on `src`.
 *
 * For CV_8UC3 images each box is outlined, labelled with its confidence and
 * logged to std::cout. For any other image type only a plain outline is drawn.
 *
 * @param src     image to draw on; Mat headers share pixel data, so the caller's image is modified.
 * @param groups  all candidate bounding boxes.
 * @param probs   confidence of each candidate, parallel to `groups`.
 * @param indexes positions inside `groups`/`probs` of the boxes to draw.
 */
void textbox_draw(Mat src, std::vector<Rect>& groups, std::vector<float>& probs, std::vector<int>& indexes)
{
    // Drawing never changes the image type, so the rendering mode is decided once.
    const bool isColorImage = (src.type() == CV_8UC3);

    for (size_t i = 0; i < indexes.size(); i++)
    {
        // `i` counts the selected boxes; the box itself lives at groups[indexes[i]].
        const int boxIdx = indexes[i];
        const Rect currentBox = groups[boxIdx];

        if (!isColorImage)
        {
            rectangle(src, currentBox, Scalar( 255 ), 3, 8 );
            continue;
        }

        rectangle(src, currentBox, Scalar( 0, 255, 255 ), 2, LINE_AA);
        String label = format("%.2f", probs[boxIdx]);
        std::cout << "text box: " << currentBox << " confidence: " << probs[boxIdx] << "\n";

        int baseLine = 0;
        Size labelSize = getTextSize(label, FONT_HERSHEY_PLAIN, 1, 1, &baseLine);
        // Keep the label inside the image when the box touches the top border.
        int yLeftBottom = std::max(currentBox.y, labelSize.height);
        rectangle(src, Point(currentBox.x, yLeftBottom - labelSize.height),
                  Point(currentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED);

        putText(src, label, Point(currentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA);
    }
}
||||
|
||||
} |
||||
|
||||
int main(int argc, const char * argv[]) |
||||
{ |
||||
if (argc < 2) |
||||
{ |
||||
std::cout << getHelpStr(argv[0]); |
||||
std::cout << "Insufiecient parameters. Aborting!" << std::endl; |
||||
exit(1); |
||||
} |
||||
|
||||
const std::string modelArch = "textbox.prototxt"; |
||||
const std::string moddelWeights = "TextBoxes_icdar13.caffemodel"; |
||||
|
||||
if (!fileExists(modelArch) || !fileExists(moddelWeights)) |
||||
{ |
||||
std::cout << getHelpStr(argv[0]); |
||||
std::cout << "Model files not found in the current directory. Aborting!" << std::endl; |
||||
exit(1); |
||||
} |
||||
|
||||
Mat image = imread(String(argv[1]), IMREAD_COLOR); |
||||
|
||||
std::cout << "Starting Text Box Demo" << std::endl; |
||||
Ptr<text::TextDetectorCNN> textSpotter = |
||||
text::TextDetectorCNN::create(modelArch, moddelWeights); |
||||
|
||||
std::vector<Rect> bbox; |
||||
std::vector<float> outProbabillities; |
||||
textSpotter->detect(image, bbox, outProbabillities); |
||||
|
||||
std::vector<int> indexes; |
||||
cv::dnn::NMSBoxes(bbox, outProbabillities, 0.3f, 0.4f, indexes); |
||||
|
||||
textbox_draw(image, bbox, outProbabillities, indexes); |
||||
|
||||
imshow("TextBox Demo",image); |
||||
std::cout << "Done!" << std::endl << std::endl; |
||||
std::cout << "Press any key to exit." << std::endl << std::endl; |
||||
waitKey(); |
||||
return 0; |
||||
} |
@ -0,0 +1,94 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include "precomp.hpp" |
||||
#include "opencv2/imgproc.hpp" |
||||
#include "opencv2/core.hpp" |
||||
#include "opencv2/dnn.hpp" |
||||
|
||||
#include <fstream> |
||||
#include <algorithm> |
||||
|
||||
using namespace cv::dnn; |
||||
|
||||
namespace cv |
||||
{ |
||||
namespace text |
||||
{ |
||||
|
||||
class TextDetectorCNNImpl : public TextDetectorCNN |
||||
{ |
||||
protected: |
||||
Net net_; |
||||
std::vector<Size> sizes_; |
||||
int inputChannelCount_; |
||||
|
||||
void getOutputs(const float* buffer,int nbrTextBoxes,int nCol, |
||||
std::vector<Rect>& Bbox, std::vector<float>& confidence, Size inputShape) |
||||
{ |
||||
for(int k = 0; k < nbrTextBoxes; k++) |
||||
{ |
||||
float x_min = buffer[k*nCol + 3]*inputShape.width; |
||||
float y_min = buffer[k*nCol + 4]*inputShape.height; |
||||
|
||||
float x_max = buffer[k*nCol + 5]*inputShape.width; |
||||
float y_max = buffer[k*nCol + 6]*inputShape.height; |
||||
|
||||
CV_Assert(x_min < x_max, y_min < y_max); |
||||
|
||||
x_min = std::max(0.f, x_min); |
||||
y_min = std::max(0.f, y_min); |
||||
|
||||
x_max = std::min(inputShape.width - 1.f, x_max); |
||||
y_max = std::min(inputShape.height - 1.f, y_max); |
||||
|
||||
int wd = cvRound(x_max - x_min); |
||||
int ht = cvRound(y_max - y_min); |
||||
|
||||
Bbox.push_back(Rect(cvRound(x_min), cvRound(y_min), wd, ht)); |
||||
confidence.push_back(buffer[k*nCol + 2]); |
||||
} |
||||
} |
||||
|
||||
public: |
||||
TextDetectorCNNImpl(const String& modelArchFilename, const String& modelWeightsFilename, std::vector<Size> detectionSizes) : |
||||
sizes_(detectionSizes) |
||||
{ |
||||
net_ = readNetFromCaffe(modelArchFilename, modelWeightsFilename); |
||||
CV_Assert(!net_.empty()); |
||||
inputChannelCount_ = 3; |
||||
} |
||||
|
||||
void detect(InputArray inputImage_, std::vector<Rect>& Bbox, std::vector<float>& confidence) |
||||
{ |
||||
CV_Assert(inputImage_.channels() == inputChannelCount_); |
||||
Mat inputImage = inputImage_.getMat(); |
||||
Bbox.resize(0); |
||||
confidence.resize(0); |
||||
|
||||
for(size_t i = 0; i < sizes_.size(); i++) |
||||
{ |
||||
Size inputGeometry = sizes_[i]; |
||||
net_.setInput(blobFromImage(inputImage, 1, inputGeometry, Scalar(123, 117, 104), false, false), "data"); |
||||
Mat outputNet = net_.forward(); |
||||
int nbrTextBoxes = outputNet.size[2]; |
||||
int nCol = outputNet.size[3]; |
||||
int outputChannelCount = outputNet.size[1]; |
||||
CV_Assert(outputChannelCount == 1); |
||||
getOutputs((float*)(outputNet.data), nbrTextBoxes, nCol, Bbox, confidence, inputImage.size()); |
||||
} |
||||
} |
||||
}; |
||||
|
||||
// Factory: builds the dnn-backed implementation for the given model files and detection sizes.
Ptr<TextDetectorCNN> TextDetectorCNN::create(const String &modelArchFilename, const String &modelWeightsFilename, std::vector<Size> detectionSizes)
{
    Ptr<TextDetectorCNNImpl> detector =
            makePtr<TextDetectorCNNImpl>(modelArchFilename, modelWeightsFilename, detectionSizes);
    return detector;
}
||||
|
||||
// Factory overload: forwards to the three-argument create() with a single 300x300 detection size.
Ptr<TextDetectorCNN> TextDetectorCNN::create(const String &modelArchFilename, const String &modelWeightsFilename)
{
    const std::vector<Size> defaultSizes(1, Size(300, 300));
    return create(modelArchFilename, modelWeightsFilename, defaultSizes);
}
||||
} //namespace text
|
||||
} //namespace cv
|
Loading…
Reference in new issue