diff --git a/modules/text/CMakeLists.txt b/modules/text/CMakeLists.txt
index ecb3a7f52..4eea5c878 100644
--- a/modules/text/CMakeLists.txt
+++ b/modules/text/CMakeLists.txt
@@ -1,2 +1,25 @@
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR})  # lets find_package() see the FindTesseract.cmake kept in this directory
+
+find_package(Tesseract)
+if(Tesseract_FOUND)
+    message(STATUS "Tesseract: YES")
+    set(HAVE_TESSERACT 1)  # picked up by "#cmakedefine HAVE_TESSERACT" in text_config.hpp.in
+else()
+    message(STATUS "Tesseract: NO")
+endif()
+
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/text_config.hpp.in
+               ${CMAKE_BINARY_DIR}/text_config.hpp @ONLY)
+
+include_directories(${CMAKE_CURRENT_BINARY_DIR})  # NOTE(review): text_config.hpp is generated into CMAKE_BINARY_DIR, not CMAKE_CURRENT_BINARY_DIR - confirm the top-level build dir is already on the include path
+
+if(Tesseract_FOUND)  # pass the variable name: "${Tesseract_FOUND}" expands to nothing when the variable is unset
+    include_directories(${Tesseract_INCLUDE_DIR})
+endif()
+
 set(the_description "Text Detection and Recognition")
 ocv_define_module(text opencv_ml opencv_highgui opencv_imgproc opencv_core)
+
+if(Tesseract_FOUND)  # link the OCR back-end only when it was located
+    target_link_libraries(opencv_text ${Tesseract_LIBS})
+endif()
diff --git a/modules/text/FindTesseract.cmake b/modules/text/FindTesseract.cmake
new file mode 100644
index 000000000..54c4a4929
--- /dev/null
+++ b/modules/text/FindTesseract.cmake
@@ -0,0 +1,24 @@
+# Tesseract OCR: looks for tesseract/baseapi.h and the "tesseract" and "lept" libraries; sets Tesseract_INCLUDE_DIR, Tesseract_LIBS and (only when all three are found) Tesseract_FOUND.
+unset(Tesseract_FOUND)
+
+find_path(Tesseract_INCLUDE_DIR tesseract/baseapi.h
+  HINTS
+  /usr/include
+  /usr/local/include)
+
+find_library(Tesseract_LIBRARY NAMES tesseract
+  HINTS
+  /usr/lib
+  /usr/local/lib)
+
+find_library(Lept_LIBRARY NAMES lept
+  HINTS
+  /usr/lib
+  /usr/local/lib)
+
+set(Tesseract_LIBS ${Tesseract_LIBRARY} ${Lept_LIBRARY})
+if(Tesseract_LIBRARY AND Lept_LIBRARY AND Tesseract_INCLUDE_DIR)  # test each result on its own: the combined list "X-NOTFOUND;/path/liblept.so" does not end in -NOTFOUND and would count as true
+    set(Tesseract_FOUND 1)
+endif()
+
+
diff --git a/modules/text/include/opencv2/text.hpp b/modules/text/include/opencv2/text.hpp
index 0e7252522..e18e5631e 100644
--- a/modules/text/include/opencv2/text.hpp
+++ b/modules/text/include/opencv2/text.hpp
@@ -40,5 +40,6 @@ the use of this software, even if advised of the possibility of such damage.
#define __OPENCV_TEXT_HPP__ #include "opencv2/text/erfilter.hpp" +#include "opencv2/text/ocr.hpp" #endif diff --git a/modules/text/include/opencv2/text/ocr.hpp b/modules/text/include/opencv2/text/ocr.hpp new file mode 100644 index 000000000..f1c593116 --- /dev/null +++ b/modules/text/include/opencv2/text/ocr.hpp @@ -0,0 +1,110 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_TEXT_OCR_HPP__ +#define __OPENCV_TEXT_OCR_HPP__ + +#include "text_config.hpp" + +#ifdef HAVE_TESSERACT +#include +#include +#endif + +#include "opencv2/core.hpp" +#include +#include + + +namespace cv +{ +namespace text +{ + +using namespace std; + +enum +{ + OCR_LEVEL_WORD, + OCR_LEVEL_TEXTLINE +}; + +#ifdef HAVE_TESSERACT +class CV_EXPORTS OCRTesseract +{ +private: + tesseract::TessBaseAPI tess; + +public: + //Default constructor + OCRTesseract(const char* datapath=NULL, const char* language=NULL, const char* char_whitelist=NULL, + tesseract::OcrEngineMode oem=tesseract::OEM_DEFAULT, tesseract::PageSegMode psmode=tesseract::PSM_AUTO); + + ~OCRTesseract(); + + void run(Mat& image, string& output_text, vector* component_rects=NULL, + vector* component_texts=NULL, vector* component_confidences=NULL, + int component_level=0); +}; +#else +//stub +class CV_EXPORTS OCRTesseract +{ +public: + //Default constructor + OCRTesseract(const char* datapath=NULL, const char* language=NULL, const char* char_whitelist=NULL, + int oem=0, int psmode=0); + + ~OCRTesseract(); + + void run(Mat& image, string& output_text, vector* component_rects=NULL, + vector* component_texts=NULL, vector* component_confidences=NULL, + int component_level=0); +}; +#endif + + + +} +} +#endif // _OPENCV_TEXT_OCR_HPP_ diff --git a/modules/text/samples/end_to_end_recognition.cpp b/modules/text/samples/end_to_end_recognition.cpp new file mode 
100644 index 000000000..5cda0cb8a --- /dev/null +++ b/modules/text/samples/end_to_end_recognition.cpp @@ -0,0 +1,343 @@ +/* + * textdetection.cpp + * + * A demo program of End-to-end Scene Text Detection and Recognition: + * Shows the use of the Tesseract OCR API with the Extremal Region Filter algorithm described in: + * Neumann L., Matas J.: Real-Time Scene Text Localization and Recognition, CVPR 2012 + * + * Created on: Jul 31, 2014 + * Author: Lluis Gomez i Bigorda + */ + +#include "opencv2/text.hpp" +#include "opencv2/core/utility.hpp" +#include "opencv2/highgui.hpp" +#include "opencv2/imgproc.hpp" + +#include + +using namespace std; +using namespace cv; +using namespace cv::text; + +//Calculate edit distance netween two words +size_t edit_distance(const string& A, const string& B); +size_t min(size_t x, size_t y, size_t z); +bool isRepetitive(const string& s); +bool sort_by_lenght(const string &a, const string &b); +//Draw ER's in an image via floodFill +void er_draw(vector &channels, vector > ®ions, vector group, Mat& segmentation); + +//Perform text detection and recognition and evaluate results using edit distance +int main(int argc, char* argv[]) +{ + cout << endl << argv[0] << endl << endl; + cout << "A demo program of End-to-end Scene Text Detection and Recognition: " << endl; + cout << "Shows the use of the Tesseract OCR API with the Extremal Region Filter algorithm described in:" << endl; + cout << "Neumann L., Matas J.: Real-Time Scene Text Localization and Recognition, CVPR 2012" << endl << endl; + + Mat image; + if(argc>1) + image = imread(argv[1]); + else + { + cout << " Usage: " << argv[0] << " [ ... 
]" << endl; + return(0); + } + + cout << "IMG_W=" << image.cols << endl; + cout << "IMG_H=" << image.rows << endl; + + /*Text Detection*/ + + // Extract channels to be processed individually + vector channels; + + Mat grey; + cvtColor(image,grey,COLOR_RGB2GRAY); + + // Notice here we are only using grey channel, see textdetection.cpp for example with more channels + channels.push_back(grey); + channels.push_back(255-grey); + + double t_d = getTickCount(); + // Create ERFilter objects with the 1st and 2nd stage default classifiers + Ptr er_filter1 = createERFilterNM1(loadClassifierNM1("trained_classifierNM1.xml"),8,0.00015,0.13,0.2,true,0.1); + Ptr er_filter2 = createERFilterNM2(loadClassifierNM2("trained_classifierNM2.xml"),0.5); + + vector > regions(channels.size()); + // Apply the default cascade classifier to each independent channel (could be done in parallel) + for (int c=0; c<(int)channels.size(); c++) + { + er_filter1->run(channels[c], regions[c]); + er_filter2->run(channels[c], regions[c]); + } + cout << "TIME_REGION_DETECTION = " << ((double)getTickCount() - t_d)*1000/getTickFrequency() << endl; + + Mat out_img_decomposition= Mat::zeros(image.rows+2, image.cols+2, CV_8UC1); + vector tmp_group; + for (int i=0; i<(int)regions.size(); i++) + { + for (int j=0; j<(int)regions[i].size();j++) + { + tmp_group.push_back(Vec2i(i,j)); + } + Mat tmp= Mat::zeros(image.rows+2, image.cols+2, CV_8UC1); + er_draw(channels, regions, tmp_group, tmp); + if (i > 0) + tmp = tmp / 2; + out_img_decomposition = out_img_decomposition | tmp; + tmp_group.clear(); + } + + double t_g = getTickCount(); + // Detect character groups + vector< vector > nm_region_groups; + vector nm_boxes; + erGrouping(image, channels, regions, nm_region_groups, nm_boxes,ERGROUPING_ORIENTATION_HORIZ); + cout << "TIME_GROUPING = " << ((double)getTickCount() - t_g)*1000/getTickFrequency() << endl; + + + + /*Text Recognition (OCR)*/ + + double t_r = getTickCount(); + OCRTesseract* ocr = new OCRTesseract(); + 
cout << "TIME_OCR_INITIALIZATION = " << ((double)getTickCount() - t_r)*1000/getTickFrequency() << endl; + string output; + + Mat out_img; + Mat out_img_detection; + Mat out_img_segmentation = Mat::zeros(image.rows+2, image.cols+2, CV_8UC1); + image.copyTo(out_img); + image.copyTo(out_img_detection); + float scale_img = 600./image.rows; + float scale_font = (2-scale_img)/1.4; + vector words_detection; + + t_r = getTickCount(); + + for (int i=0; i<(int)nm_boxes.size(); i++) + { + + rectangle(out_img_detection, nm_boxes[i].tl(), nm_boxes[i].br(), Scalar(0,255,255), 3); + + Mat group_img = Mat::zeros(image.rows+2, image.cols+2, CV_8UC1); + er_draw(channels, regions, nm_region_groups[i], group_img); + Mat group_segmentation; + group_img.copyTo(group_segmentation); + //image(nm_boxes[i]).copyTo(group_img); + group_img(nm_boxes[i]).copyTo(group_img); + copyMakeBorder(group_img,group_img,15,15,15,15,BORDER_CONSTANT,Scalar(0)); + + vector boxes; + vector words; + vector confidences; + ocr->run(group_img, output, &boxes, &words, &confidences, OCR_LEVEL_WORD); + + output.erase(remove(output.begin(), output.end(), '\n'), output.end()); + //cout << "OCR output = \"" << output << "\" lenght = " << output.size() << endl; + if (output.size() < 3) + continue; + + for (int j=0; j<(int)boxes.size(); j++) + { + boxes[j].x += nm_boxes[i].x-15; + boxes[j].y += nm_boxes[i].y-15; + + //cout << " word = " << words[j] << "\t confidence = " << confidences[j] << endl; + if ((words[j].size() < 2) || (confidences[j] < 51) || + ((words[j].size()==2) && (words[j][0] == words[j][1])) || + ((words[j].size()< 4) && (confidences[j] < 60)) || + isRepetitive(words[j])) + continue; + words_detection.push_back(words[j]); + rectangle(out_img, boxes[j].tl(), boxes[j].br(), Scalar(255,0,255),3); + Size word_size = getTextSize(words[j], FONT_HERSHEY_SIMPLEX, scale_font, 3*scale_font, NULL); + rectangle(out_img, boxes[j].tl()-Point(3,word_size.height+3), boxes[j].tl()+Point(word_size.width,0), 
Scalar(255,0,255),-1); + putText(out_img, words[j], boxes[j].tl()-Point(1,1), FONT_HERSHEY_SIMPLEX, scale_font, Scalar(255,255,255),3*scale_font); + out_img_segmentation = out_img_segmentation | group_segmentation; + } + + } + + cout << "TIME_OCR = " << ((double)getTickCount() - t_r)*1000/getTickFrequency() << endl; + + + /* Recognition evaluation with (approximate) hungarian matching and edit distances */ + + if(argc>2) + { + int num_gt_characters = 0; + vector words_gt; + for (int i=2; i 0) + { + words_gt.push_back(string(argv[i])); + //cout << " GT word " << words_gt[words_gt.size()-1] << endl; + num_gt_characters += words_gt[words_gt.size()-1].size(); + } + } + + if (words_detection.empty()) + { + //cout << endl << "number of characters in gt = " << num_gt_characters << endl; + cout << "TOTAL_EDIT_DISTANCE = " << num_gt_characters << endl; + cout << "EDIT_DISTANCE_RATIO = 1" << endl; + } + else + { + + sort(words_gt.begin(),words_gt.end(),sort_by_lenght); + + int max_dist=0; + vector< vector > assignment_mat; + for (int i=0; i<(int)words_gt.size(); i++) + { + vector assignment_row(words_detection.size(),0); + assignment_mat.push_back(assignment_row); + for (int j=0; j<(int)words_detection.size(); j++) + { + assignment_mat[i][j] = edit_distance(words_gt[i],words_detection[j]); + max_dist = max(max_dist,assignment_mat[i][j]); + } + } + + vector words_detection_matched; + + int total_edit_distance = 0; + int tp=0, fp=0, fn=0; + for (int search_dist=0; search_dist<=max_dist; search_dist++) + { + for (int i=0; i<(int)assignment_mat.size(); i++) + { + int min_dist_idx = distance(assignment_mat[i].begin(), + min_element(assignment_mat[i].begin(),assignment_mat[i].end())); + if (assignment_mat[i][min_dist_idx] == search_dist) + { + //cout << " GT word \"" << words_gt[i] << "\" best match \"" << words_detection[min_dist_idx] << "\" with dist " << assignment_mat[i][min_dist_idx] << endl; + if(search_dist == 0) + tp++; + else { fp++; fn++; } + + total_edit_distance += 
assignment_mat[i][min_dist_idx]; + words_detection_matched.push_back(min_dist_idx); + words_gt.erase(words_gt.begin()+i); + assignment_mat.erase(assignment_mat.begin()+i); + for (int j=0; j<(int)assignment_mat.size(); j++) + { + assignment_mat[j][min_dist_idx]=INT_MAX; + } + i--; + } + } + } + + for (int j=0; j<(int)words_gt.size(); j++) + { + //cout << " GT word \"" << words_gt[j] << "\" no match found" << endl; + fn++; + total_edit_distance += words_gt[j].size(); + } + for (int j=0; j<(int)words_detection.size(); j++) + { + if (find(words_detection_matched.begin(),words_detection_matched.end(),j) == words_detection_matched.end()) + { + //cout << " Detection word \"" << words_detection[j] << "\" no match found" << endl; + fp++; + total_edit_distance += words_detection[j].size(); + } + } + + + //cout << endl << "number of characters in gt = " << num_gt_characters << endl; + cout << "TOTAL_EDIT_DISTANCE = " << total_edit_distance << endl; + cout << "EDIT_DISTANCE_RATIO = " << (float)total_edit_distance / num_gt_characters << endl; + cout << "TP = " << tp << endl; + cout << "FP = " << fp << endl; + cout << "FN = " << fn << endl; + } + } + + + + //resize(out_img_detection,out_img_detection,Size(image.cols*scale_img,image.rows*scale_img)); + //imshow("detection", out_img_detection); + //imwrite("detection.jpg", out_img_detection); + //resize(out_img,out_img,Size(image.cols*scale_img,image.rows*scale_img)); + namedWindow("recognition",WINDOW_NORMAL); + imshow("recognition", out_img); + waitKey(0); + //imwrite("recognition.jpg", out_img); + //imwrite("segmentation.jpg", out_img_segmentation); + //imwrite("decomposition.jpg", out_img_decomposition); + + return 0; +} + +size_t min(size_t x, size_t y, size_t z) +{ + return x < y ? 
min(x,z) : min(y,z); +} + +size_t edit_distance(const string& A, const string& B) +{ + size_t NA = A.size(); + size_t NB = B.size(); + + vector< vector > M(NA + 1, vector(NB + 1)); + + for (size_t a = 0; a <= NA; ++a) + M[a][0] = a; + + for (size_t b = 0; b <= NB; ++b) + M[0][b] = b; + + for (size_t a = 1; a <= NA; ++a) + for (size_t b = 1; b <= NB; ++b) + { + size_t x = M[a-1][b] + 1; + size_t y = M[a][b-1] + 1; + size_t z = M[a-1][b-1] + (A[a-1] == B[b-1] ? 0 : 1); + M[a][b] = min(x,y,z); + } + + return M[A.size()][B.size()]; +} + +bool isRepetitive(const string& s) +{ + int count = 0; + for (int i=0; i<(int)s.size(); i++) + { + if ((s[i] == 'i') || + (s[i] == 'l') || + (s[i] == 'I')) + count++; + } + if (count > ((int)s.size()+1)/2) + { + return true; + } + return false; +} + + +void er_draw(vector &channels, vector > ®ions, vector group, Mat& segmentation) +{ + for (int r=0; r<(int)group.size(); r++) + { + ERStat er = regions[group[r][0]][group[r][1]]; + if (er.parent != NULL) // deprecate the root region + { + int newMaskVal = 255; + int flags = 4 + (newMaskVal << 8) + FLOODFILL_FIXED_RANGE + FLOODFILL_MASK_ONLY; + floodFill(channels[group[r][0]],segmentation,Point(er.pixel%channels[group[r][0]].cols,er.pixel/channels[group[r][0]].cols), + Scalar(255),0,Scalar(er.level),Scalar(0),flags); + } + } +} + +bool sort_by_lenght(const string &a, const string &b){return (a.size()>b.size());} diff --git a/modules/text/src/ocr.cpp b/modules/text/src/ocr.cpp new file mode 100644 index 000000000..8b06f69bd --- /dev/null +++ b/modules/text/src/ocr.cpp @@ -0,0 +1,177 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. 
+// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include "precomp.hpp" +#include "opencv2/imgproc.hpp" +#include "opencv2/ml.hpp" + +#include +#include +#include + +namespace cv +{ +namespace text +{ + +using namespace std; + + +#ifdef HAVE_TESSERACT +//Default constructor +OCRTesseract::OCRTesseract(const char* datapath, const char* language, const char* char_whitelist, tesseract::OcrEngineMode oemode, tesseract::PageSegMode psmode) +{ + + const char *lang = "eng"; + if (language != NULL) + lang = language; + + if (tess.Init(datapath, lang, oemode)) + { + cout << "OCRTesseract: Could not initialize tesseract." << endl; + throw 1; + } + + //cout << "OCRTesseract: tesseract version " << tess.Version() << endl; + + tesseract::PageSegMode pagesegmode = psmode; + tess.SetPageSegMode(pagesegmode); + + if(char_whitelist != NULL) + tess.SetVariable("tessedit_char_whitelist", char_whitelist); + else + tess.SetVariable("tessedit_char_whitelist", "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"); + + tess.SetVariable("save_best_choices", "T"); + +} + +OCRTesseract::~OCRTesseract() +{ + tess.End(); +} + +void OCRTesseract::run(Mat& image, string& output, vector* component_rects, + vector* component_texts, vector* component_confidences, int component_level) +{ + CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC1) ); + if (component_texts != 0) + component_texts->clear(); + if (component_rects != 0) + component_rects->clear(); + if (component_confidences != 0) + component_confidences->clear(); + + tess.SetImage((uchar*)image.data, image.size().width, image.size().height, image.channels(), image.step1()); + tess.Recognize(0); + output = string(tess.GetUTF8Text()); + + if ( (component_rects != NULL) || (component_texts != NULL) || (component_confidences != NULL) ) + { + tesseract::ResultIterator* ri = tess.GetIterator(); + tesseract::PageIteratorLevel level = tesseract::RIL_WORD; + if (component_level == OCR_LEVEL_TEXTLINE) + level = tesseract::RIL_TEXTLINE; + + if (ri != 0) { + do { 
+ const char* word = ri->GetUTF8Text(level); + if (word == NULL) + continue; + float conf = ri->Confidence(level); + int x1, y1, x2, y2; + ri->BoundingBox(level, &x1, &y1, &x2, &y2); + + if (component_texts != 0) + component_texts->push_back(string(word)); + if (component_rects != 0) + component_rects->push_back(Rect(x1,y1,x2-x1,y2-y1)); + if (component_confidences != 0) + component_confidences->push_back(conf); + + delete[] word; + } while (ri->Next(level)); + } + delete ri; + } + + tess.Clear(); +} +#else +//Stub constructor +OCRTesseract::OCRTesseract(const char* datapath, const char* language, const char* char_whitelist, int oemode, int psmode) +{ + cout << "OCRTesseract("<* component_rects, + vector* component_texts, vector* component_confidences, int component_level) +{ + CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC1) ); + + cout << "OCRTesseract(" << component_level << image.type() <<"): Tesseract not found." << endl; + output.clear(); + if(component_rects) + component_rects->clear(); + if(component_texts) + component_texts->clear(); + if(component_confidences) + component_confidences->clear(); +} +#endif + + + +} +} diff --git a/modules/text/text_config.hpp.in b/modules/text/text_config.hpp.in new file mode 100644 index 000000000..30089bd3c --- /dev/null +++ b/modules/text/text_config.hpp.in @@ -0,0 +1,7 @@ +#ifndef __OPENCV_TEXT_CONFIG_HPP__ +#define __OPENCV_TEXT_CONFIG_HPP__ + +// HAVE OCR Tesseract +#cmakedefine HAVE_TESSERACT + +#endif \ No newline at end of file