diff --git a/modules/text/include/opencv2/text/ocr.hpp b/modules/text/include/opencv2/text/ocr.hpp
index 795640e79..0155360de 100644
--- a/modules/text/include/opencv2/text/ocr.hpp
+++ b/modules/text/include/opencv2/text/ocr.hpp
@@ -69,6 +69,9 @@ public:
     virtual void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL,
                      std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
                      int component_level=0) = 0;
+    virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
+                     std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
+                     int component_level=0) = 0;
 };
 
 /** @brief OCRTesseract class provides an interface with the tesseract-ocr API (v3.02.02) in C++.
@@ -106,6 +109,10 @@ public:
                      std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
                      int component_level=0);
 
+    virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
+                     std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
+                     int component_level=0);
+
     /** @brief Creates an instance of the OCRTesseract class. Initializes Tesseract.
 
     @param datapath the name of the parent directory of tessdata ended with "/", or NULL to use the
@@ -170,11 +177,11 @@ public:
 public:
     /** @brief Recognize text using HMM.
 
-    Takes image on input and returns recognized text in the output_text parameter. Optionally
+    Takes a binary image on input and returns recognized text in the output_text parameter. Optionally
     provides also the Rects for individual text elements found (e.g. words), and the list of those
     text elements with their confidence values.
 
-    @param image Input image CV_8UC1 with a single text line (or word).
+    @param image Input binary image CV_8UC1 with a single text line (or word).
 
     @param output_text Output text. Most likely character sequence found by the HMM decoder.
@@ -193,6 +200,33 @@ public:
                      std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
                      int component_level=0);
 
+    /** @brief Recognize text using HMM.
+
+    Takes an image and a mask (where each connected component corresponds to a segmented character)
+    on input and returns recognized text in the output_text parameter. Optionally it also
+    provides the Rects for individual text elements found (e.g. words), and the list of those
+    text elements with their confidence values.
+
+    @param image Input image CV_8UC1 or CV_8UC3 with a single text line (or word).
+
+    @param mask Input binary image CV_8UC1 of the same size as the input image. Each connected
+    component in the mask corresponds to a segmented character in the input image.
+
+    @param output_text Output text. Most likely character sequence found by the HMM decoder.
+
+    @param component_rects If provided the method will output a list of Rects for the individual
+    text elements found (e.g. words).
+
+    @param component_texts If provided the method will output a list of text strings for the
+    recognition of individual text elements found (e.g. words).
+
+    @param component_confidences If provided the method will output a list of confidence values
+    for the recognition of individual text elements found (e.g. words).
+
+    @param component_level Only OCR_LEVEL_WORD is supported.
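+
+    A minimal usage sketch (the image/mask pair is one of the sample images added by this patch;
+    ocrhmm is assumed to be a Ptr<OCRHMMDecoder> built with OCRHMMDecoder::create()):
+    @code
+        Mat image = imread("scenetext_segmented_word01.jpg");
+        Mat mask  = imread("scenetext_segmented_word01_mask.png", IMREAD_GRAYSCALE);
+        threshold(mask, mask, 128., 255, THRESH_BINARY); // run() expects a binary CV_8UC1 mask
+        std::string word;
+        ocrhmm->run(image, mask, word);
+    @endcode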
+    */
+    virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
+                     std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
+                     int component_level=0);
+
     /** @brief Creates an instance of the OCRHMMDecoder class. Initializes HMMDecoder.
 
     @param classifier The character classifier with built in feature extractor.
@@ -231,7 +265,7 @@ protected:
 
 @param filename The XML or YAML file with the classifier model (e.g. OCRHMM_knn_model_data.xml)
 
-The default classifier is based in the scene text recognition method proposed by Lukás Neumann &
+The KNN default classifier is based on the scene text recognition method proposed by Lukás Neumann &
 Jiri Matas in [Neumann11b]. Basically, the region (contour) in the input image is normalized to a
 fixed size, while retaining the centroid and aspect ratio, in order to extract a feature vector
 based on gradient orientations along the chain-code of its perimeter. Then, the region is classified
@@ -240,19 +274,32 @@ types.
 */
 CV_EXPORTS Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierNM(const std::string& filename);
 
-/** @brief Utility function to create a tailored language model transitions table from a given list of words (lexicon).
+/** @brief Allow to implicitly load the default character classifier when creating an OCRHMMDecoder object.
 
-@param vocabulary The language vocabulary (chars when ascii english text).
+@param filename The XML or YAML file with the classifier model (e.g. OCRBeamSearch_CNN_model_data.xml.gz)
 
-@param lexicon The list of words that are expected to be found in a particular image.
+The CNN default classifier is based on the scene text recognition method proposed by Adam Coates &
+Andrew Ng in [Coates11a]. The character classifier consists of a Single Layer Convolutional Neural Network and
+a linear classifier. It is applied to the input image in a sliding window fashion, providing a set of recognitions
+at each window location.
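+
+A usage sketch (vocabulary and the two probability tables are the ones expected by
+OCRHMMDecoder::create(); see the segmented_word_recognition.cpp sample added below):
+@code
+    Ptr<OCRHMMDecoder> ocrCNN = OCRHMMDecoder::create(
+            loadOCRHMMClassifierCNN("OCRBeamSearch_CNN_model_data.xml.gz"),
+            vocabulary, transition_probabilities, emission_probabilities);
+@endcode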
+ */
+CV_EXPORTS Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierCNN(const std::string& filename);
 
-@param transition_probabilities_table Output table with transition probabilities between character pairs. cols == rows == vocabulary.size().
+//! @}
 
-The function calculate frequency statistics of character pairs from the given lexicon and fills the output transition_probabilities_table with them. The transition_probabilities_table can be used as input in the OCRHMMDecoder::create() and OCRBeamSearchDecoder::create() methods.
-@note
-   - (C++) An alternative would be to load the default generic language transition table provided in the text module samples folder (created from ispell 42869 english words list) :
-
- */
+/** @brief Utility function to create a tailored language model transitions table from a given list of words (lexicon).
+ *
+ * @param vocabulary The language vocabulary (chars when ascii english text).
+ *
+ * @param lexicon The list of words that are expected to be found in a particular image.
+ *
+ * @param transition_probabilities_table Output table with transition probabilities between character pairs. cols == rows == vocabulary.size().
+ *
+ * The function calculates frequency statistics of character pairs from the given lexicon and fills the output transition_probabilities_table with them. The transition_probabilities_table can be used as input in the OCRHMMDecoder::create() and OCRBeamSearchDecoder::create() methods.
+ * @note
+ *    - (C++) An alternative would be to load the default generic language transition table provided in the text module samples folder (created from ispell 42869 english words list):
+ *      OCRHMM_transitions_table.xml
+ */
 CV_EXPORTS void createOCRHMMTransitionsTable(std::string& vocabulary, std::vector<std::string>& lexicon, OutputArray transition_probabilities_table);
@@ -319,6 +366,10 @@ public:
                      std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
                      int component_level=0);
 
+    virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
+                     std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
+                     int component_level=0);
+
     /** @brief Creates an instance of the OCRBeamSearchDecoder class. Initializes HMMDecoder.
 
     @param classifier The character classifier with built in feature extractor.
@@ -359,10 +410,10 @@ protected:
 
 /** @brief Allow to implicitly load the default character classifier when creating an OCRBeamSearchDecoder object.
 
-@param filename The XML or YAML file with the classifier model (e.g. OCRHMM_knn_model_data.xml)
+@param filename The XML or YAML file with the classifier model (e.g. OCRBeamSearch_CNN_model_data.xml.gz)
 
-The default classifier is based in the scene text recognition method proposed by Adam Coates &
-Andrew NG in [Coates11a]. The character classifier sonsists in a Single Layer Convolutional Neural Network and
+The CNN default classifier is based on the scene text recognition method proposed by Adam Coates &
+Andrew Ng in [Coates11a]. The character classifier consists of a Single Layer Convolutional Neural Network and
 a linear classifier. It is applied to the input image in a sliding window fashion, providing a set of recognitions
 at each window location.
 */
diff --git a/modules/text/samples/OCRBeamSearch_CNN_model_data.xml.gz b/modules/text/samples/OCRBeamSearch_CNN_model_data.xml.gz
index 554052811..048f1d5b7 100644
Binary files a/modules/text/samples/OCRBeamSearch_CNN_model_data.xml.gz and b/modules/text/samples/OCRBeamSearch_CNN_model_data.xml.gz differ
diff --git a/modules/text/samples/cropped_word_recognition.cpp b/modules/text/samples/cropped_word_recognition.cpp
index 583f4ff01..65ac792f1 100644
--- a/modules/text/samples/cropped_word_recognition.cpp
+++ b/modules/text/samples/cropped_word_recognition.cpp
@@ -36,7 +36,7 @@ int main(int argc, char* argv[])
         return(0);
     }
 
-    string vocabulary = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyx0123456789"; // must have the same order as the clasifier output classes
+    string vocabulary = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; // must have the same order as the classifier output classes
     vector<string> lexicon;  // a list of words expected to be found on the input image
     lexicon.push_back(string("abb"));
     lexicon.push_back(string("patata"));
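For reference, the tailored language model for such a lexicon is then built with createOCRHMMTransitionsTable (documented above); a sketch using this sample's variable names:

    Mat transition_probabilities;
    createOCRHMMTransitionsTable(vocabulary, lexicon, transition_probabilities);
    // the resulting vocabulary.size() x vocabulary.size() table can be passed to
    // OCRHMMDecoder::create() or OCRBeamSearchDecoder::create()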
diff --git a/modules/text/samples/scenetext_segmented_word01.jpg b/modules/text/samples/scenetext_segmented_word01.jpg
new file mode 100644
index 000000000..605acd4c6
Binary files /dev/null and b/modules/text/samples/scenetext_segmented_word01.jpg differ
diff --git a/modules/text/samples/scenetext_segmented_word01_mask.png b/modules/text/samples/scenetext_segmented_word01_mask.png
new file mode 100644
index 000000000..8d488d95c
Binary files /dev/null and b/modules/text/samples/scenetext_segmented_word01_mask.png differ
diff --git a/modules/text/samples/scenetext_segmented_word02.jpg b/modules/text/samples/scenetext_segmented_word02.jpg
new file mode 100644
index 000000000..26f02ac53
Binary files /dev/null and b/modules/text/samples/scenetext_segmented_word02.jpg differ
diff --git a/modules/text/samples/scenetext_segmented_word02_mask.png b/modules/text/samples/scenetext_segmented_word02_mask.png
new file mode 100644
index 000000000..486286f3a
Binary files /dev/null and b/modules/text/samples/scenetext_segmented_word02_mask.png differ
diff --git a/modules/text/samples/scenetext_segmented_word03.jpg b/modules/text/samples/scenetext_segmented_word03.jpg
new file mode 100644
index 000000000..2b4ab02f1
Binary files /dev/null and b/modules/text/samples/scenetext_segmented_word03.jpg differ
diff --git a/modules/text/samples/scenetext_segmented_word03_mask.png b/modules/text/samples/scenetext_segmented_word03_mask.png
new file mode 100644
index 000000000..bb378b21d
Binary files /dev/null and b/modules/text/samples/scenetext_segmented_word03_mask.png differ
diff --git a/modules/text/samples/scenetext_segmented_word04.jpg b/modules/text/samples/scenetext_segmented_word04.jpg
new file mode 100644
index 000000000..758c48bfa
Binary files /dev/null and b/modules/text/samples/scenetext_segmented_word04.jpg differ
diff --git a/modules/text/samples/scenetext_segmented_word04_mask.png b/modules/text/samples/scenetext_segmented_word04_mask.png
new file mode 100644
index 000000000..20e7e3f9b
Binary files /dev/null and b/modules/text/samples/scenetext_segmented_word04_mask.png differ
diff --git a/modules/text/samples/scenetext_segmented_word05.jpg b/modules/text/samples/scenetext_segmented_word05.jpg
new file mode 100644
index 000000000..0788acb49
Binary files /dev/null and b/modules/text/samples/scenetext_segmented_word05.jpg differ
diff --git a/modules/text/samples/scenetext_segmented_word05_mask.png b/modules/text/samples/scenetext_segmented_word05_mask.png
new file mode 100644
index 000000000..29857008e
Binary files /dev/null and b/modules/text/samples/scenetext_segmented_word05_mask.png differ
diff --git a/modules/text/samples/segmented_word_recognition.cpp b/modules/text/samples/segmented_word_recognition.cpp
new file mode 100644
index 000000000..d3b50d24f
--- /dev/null
+++ b/modules/text/samples/segmented_word_recognition.cpp
@@ -0,0 +1,116 @@
+/*
+ * segmented_word_recognition.cpp
+ *
+ * A demo program on segmented word recognition.
+ * Shows the use of the OCRHMMDecoder API with the two provided default character classifiers.
+ *
+ * Created on: Jul 31, 2015
+ *     Author: Lluis Gomez i Bigorda
+ */
+
+#include "opencv2/text.hpp"
+#include "opencv2/core/utility.hpp"
+#include "opencv2/highgui.hpp"
+#include "opencv2/imgproc.hpp"
+
+#include <iostream>
+
+using namespace std;
+using namespace cv;
+using namespace text;
+
+
+int main(int argc, char* argv[]) {
+
+    const String keys =
+        "{help h usage ? | | print this message.}"
+        "{@image | | source image for recognition.}"
+        "{@mask | | binary segmentation mask where each contour is a character.}"
+        "{lexicon lex l | | (optional) lexicon provided as a list of comma separated words.}"
+    ;
+    CommandLineParser parser(argc, argv, keys);
+
+    parser.about("\nSegmented word recognition.\nA demo program on segmented word recognition. Shows the use of the OCRHMMDecoder API with the two provided default character classifiers.\n");
+
+    String filename1 = parser.get<String>(0);
+    String filename2 = parser.get<String>(1);
+
+    parser.printMessage();
+    cout << endl << endl;
+    if ((parser.has("help")) || (filename1.size()==0))
+    {
+        return 0;
+    }
+    if (!parser.check())
+    {
+        parser.printErrors();
+        return 0;
+    }
+
+    Mat image = imread(filename1);
+    Mat mask;
+    if (filename2.size() > 0)
+        mask = imread(filename2);
+    else
+        image.copyTo(mask);
+
+    // make sure the mask is a binary image
+    cvtColor(mask, mask, COLOR_BGR2GRAY);
+    threshold(mask, mask, 128., 255, THRESH_BINARY);
+
+    // character recognition vocabulary
+    string voc = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
+    // Emission probabilities for the HMM language model (identity matrix by default)
+    Mat emissionProbabilities = Mat::eye((int)voc.size(), (int)voc.size(), CV_64FC1);
+    // Bigram transition probabilities for the HMM language model
+    Mat transitionProbabilities;
+
+    string lex = parser.get<string>("lex");
+    if (lex.size()>0)
+    {
+        // Build a tailored language model for the provided lexicon
+        vector<string> lexicon;
+        size_t pos = 0;
+        string delimiter = ",";
+        std::string token;
+        while ((pos = lex.find(delimiter)) != std::string::npos) {
+            token = lex.substr(0, pos);
+            lexicon.push_back(token);
+            lex.erase(0, pos + delimiter.length());
+        }
+        lexicon.push_back(lex);
+        createOCRHMMTransitionsTable(voc,lexicon,transitionProbabilities);
+    } else {
+        // Or load the generic language model (from Aspell English dictionary)
+        FileStorage fs("./OCRHMM_transitions_table.xml", FileStorage::READ);
+        fs["transition_probabilities"] >> transitionProbabilities;
+        fs.release();
+    }
+
+    Ptr<OCRTesseract> ocrTes = OCRTesseract::create();
+
+    Ptr<OCRHMMDecoder> ocrNM = OCRHMMDecoder::create(
+            loadOCRHMMClassifierNM("./OCRHMM_knn_model_data.xml.gz"),
+            voc, transitionProbabilities, emissionProbabilities);
+
+    Ptr<OCRHMMDecoder> ocrCNN = OCRHMMDecoder::create(
+            loadOCRHMMClassifierCNN("OCRBeamSearch_CNN_model_data.xml.gz"),
+            voc, transitionProbabilities, emissionProbabilities);
+
+    std::string output;
+    double t_r = (double)getTickCount();
+    ocrTes->run(mask, output);
+    output.erase(remove(output.begin(), output.end(), '\n'), output.end());
+    cout << " OCR_Tesseract output \"" << output << "\". Done in "
+         << ((double)getTickCount() - t_r)*1000/getTickFrequency() << " ms." << endl;
+
+    t_r = (double)getTickCount();
+    ocrNM->run(mask, output);
+    cout << " OCR_NM output \"" << output << "\". Done in "
+         << ((double)getTickCount() - t_r)*1000/getTickFrequency() << " ms." << endl;
+
+    t_r = (double)getTickCount();
+    ocrCNN->run(image, mask, output);
+    cout << " OCR_CNN output \"" << output << "\". Done in "
+         << ((double)getTickCount() - t_r)*1000/getTickFrequency() << " ms." << endl;
+}
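Assuming the usual OpenCV sample naming scheme (the binary name may differ in your build), the demo can be run against the test images added above, with an optional comma-separated lexicon:

    ./example_text_segmented_word_recognition scenetext_segmented_word01.jpg scenetext_segmented_word01_mask.png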
<< endl; +} diff --git a/modules/text/src/ocr_beamsearch_decoder.cpp b/modules/text/src/ocr_beamsearch_decoder.cpp index b86a6482b..f11546dea 100644 --- a/modules/text/src/ocr_beamsearch_decoder.cpp +++ b/modules/text/src/ocr_beamsearch_decoder.cpp @@ -73,6 +73,22 @@ void OCRBeamSearchDecoder::run(Mat& image, string& output_text, vector* co component_confidences->clear(); } +void OCRBeamSearchDecoder::run(Mat& image, Mat& mask, string& output_text, vector* component_rects, + vector* component_texts, vector* component_confidences, + int component_level) +{ + CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) ); + CV_Assert( mask.type() == CV_8UC1 ); + CV_Assert( (component_level == OCR_LEVEL_TEXTLINE) || (component_level == OCR_LEVEL_WORD) ); + output_text.clear(); + if (component_rects != NULL) + component_rects->clear(); + if (component_texts != NULL) + component_texts->clear(); + if (component_confidences != NULL) + component_confidences->clear(); +} + void OCRBeamSearchDecoder::ClassifierCallback::eval( InputArray image, vector< vector >& recognition_probabilities, vector& oversegmentation) { @@ -136,7 +152,7 @@ public: if (component_confidences != NULL) component_confidences->clear(); - // TODO split a line into words + // TODO We must split a line into words or specify we only work with words if(src.type() == CV_8UC3) { @@ -174,14 +190,7 @@ public: } - //TODO it would be interesting to have a hash table with a vector of booleans - // but this is not possible when we have a large number of possible segmentations. - //vector visited_nodes(pow(2,oversegmentation.size()),false); // hash table for visited nodes - // options are using std::set to store only the keys of visited nodes - // but will deteriorate the time performance. set visited_nodes; //TODO make it member of class - // it is also possible to reduce the number of seg. points in some way (e.g. use only seg.points - // for which there is a change on the class prediction) vector start_segmentation; start_segmentation.push_back(oversegmentation[0]); @@ -221,6 +230,21 @@ public: return; } + void run( Mat& src, + Mat& mask, + string& out_sequence, + vector* component_rects, + vector* component_texts, + vector* component_confidences, + int component_level) + { + + CV_Assert( mask.type() == CV_8UC1 ); + + // Nothing to do with a mask here. We do slidding window anyway. + run( src, out_sequence, component_rects, component_texts, component_confidences, component_level ); + } + private: //////////////////////////////////////////////////////////// @@ -421,22 +445,29 @@ OCRBeamSearchClassifierCNN::OCRBeamSearchClassifierCNN (const string& filename) fs["feature_min"] >> feature_min; fs["feature_max"] >> feature_max; fs.release(); - // TODO check all matrix dimensions match correctly and no one is empty } else CV_Error(Error::StsBadArg, "Default classifier data file not found!"); - nr_feature = weights.rows; - nr_class = weights.cols; - // TODO some of this can be inferred from the input file (e.g. 
-    // TODO some of this can be inferred from the input file (e.g. patch size must be sqrt(filters.cols))
-    step_size  = 4;
+    // check all matrix dimensions match correctly and no one is empty
+    CV_Assert( (M.cols > 0) && (M.rows > 0) );
+    CV_Assert( (P.cols > 0) && (P.rows > 0) );
+    CV_Assert( (kernels.cols > 0) && (kernels.rows > 0) );
+    CV_Assert( (weights.cols > 0) && (weights.rows > 0) );
+    CV_Assert( (feature_min.cols > 0) && (feature_min.rows > 0) );
+    CV_Assert( (feature_max.cols > 0) && (feature_max.rows > 0) );
+
+    nr_feature = weights.rows;
+    nr_class   = weights.cols;
+    patch_size = (int)sqrt(kernels.cols);
+    // algorithm internal parameters
     window_size = 32;
     quad_size   = 12;
-    patch_size  = 8;
     num_quads   = 25;
     num_tiles   = 25;
     alpha       = 0.5;
+    step_size   = 4; // TODO should this be a parameter for the user?
 }
 
@@ -459,7 +490,6 @@ void OCRBeamSearchClassifierCNN::eval( InputArray _src, vector< vector<double> >
     {
         cvtColor(src,src,COLOR_RGB2GRAY);
     }
 
-    // TODO shall we resize the input image or make a copy ?
     resize(src,src,Size(window_size*src.cols/src.rows,window_size));
 
     int seg_points = 0;
@@ -555,8 +585,8 @@ void OCRBeamSearchClassifierCNN::eval( InputArray _src, vector< vector<double> >
         double *p = new double[nr_class];
         double predict_label = eval_feature(feature,p);
         //cout << " Prediction: " << vocabulary[predict_label] << " with probability " << p[0] << endl;
-        if (predict_label < 0) // TODO use cvError
-            cout << "OCRBeamSearchClassifierCNN::eval Error: unexpected prediction in eval_feature()" << endl;
+        if (predict_label < 0)
+            CV_Error(Error::StsInternal, "OCRBeamSearchClassifierCNN::eval Error: unexpected prediction in eval_feature()");
 
         seg_points++;
 
diff --git a/modules/text/src/ocr_hmm_decoder.cpp b/modules/text/src/ocr_hmm_decoder.cpp
index 69b431f2d..8a0a74b9d 100644
--- a/modules/text/src/ocr_hmm_decoder.cpp
+++ b/modules/text/src/ocr_hmm_decoder.cpp
@@ -74,6 +74,22 @@ void OCRHMMDecoder::run(Mat& image, string& output_text, vector<Rect>* component_rects,
         component_confidences->clear();
 }
 
+void OCRHMMDecoder::run(Mat& image, Mat& mask, string& output_text, vector<Rect>* component_rects,
+                        vector<string>* component_texts, vector<float>* component_confidences,
+                        int component_level)
+{
+    CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
+    CV_Assert( mask.type() == CV_8UC1 );
+    CV_Assert( (component_level == OCR_LEVEL_TEXTLINE) || (component_level == OCR_LEVEL_WORD) );
+    output_text.clear();
+    if (component_rects != NULL)
+        component_rects->clear();
+    if (component_texts != NULL)
+        component_texts->clear();
+    if (component_confidences != NULL)
+        component_confidences->clear();
+}
+
 void OCRHMMDecoder::ClassifierCallback::eval( InputArray image, vector<int>& out_class, vector<double>& out_confidence)
 {
     CV_Assert(( image.getMat().type() == CV_8UC3 ) || ( image.getMat().type() == CV_8UC1 ));
@@ -263,6 +279,264 @@ public:
             obs.push_back(out_class[0]);
             observations.push_back(out_class);
             confidences.push_back(out_conf);
+            //cout << " out class = " << vocabulary[out_class[0]] << endl;
         }
 
+
+        //This must be extracted from the dictionary, or just assumed to be equal for all characters
+        vector<double> start_p(vocabulary.size());
+        for (int i=0; i<(int)vocabulary.size(); i++)
+            start_p[i] = 1.0/vocabulary.size();
+
+
+        Mat V = Mat::zeros((int)observations.size(),(int)vocabulary.size(),CV_64FC1);
+        vector<string> path(vocabulary.size());
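+
+        // For reference, the Viterbi recurrence computed below is
+        //   V(t,i) = max_j V(t-1,j) * transition_p(j,i) * emission_p(i,obs[t])
+        // with base case V(0,i) = start_p[i] * emission_p(i,obs[0]).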
+
+        // Initialize base cases (t == 0)
+        for (int i=0; i<(int)vocabulary.size(); i++)
+        {
+            for (int j=0; j<(int)observations[0].size(); j++)
+            {
+                emission_p.at<double>(observations[0][j],obs[0]) = confidences[0][j];
+            }
+            V.at<double>(0,i) = start_p[i] * emission_p.at<double>(i,obs[0]);
+            path[i] = vocabulary.at(i);
+        }
+
+
+        // Run Viterbi for t > 0
+        for (int t=1; t<(int)obs.size(); t++)
+        {
+
+            // the emission probabilities must be re-initialized at each time step
+            emission_p = Mat::eye(62,62,CV_64FC1);
+            for (int e=0; e<(int)observations[t].size(); e++)
+            {
+                emission_p.at<double>(observations[t][e],obs[t]) = confidences[t][e];
+            }
+
+            vector<string> newpath(vocabulary.size());
+
+            for (int i=0; i<(int)vocabulary.size(); i++)
+            {
+                double max_prob = 0;
+                int best_idx = 0;
+                for (int j=0; j<(int)vocabulary.size(); j++)
+                {
+                    double prob = V.at<double>(t-1,j) * transition_p.at<double>(j,i) * emission_p.at<double>(i,obs[t]);
+                    if ( prob > max_prob)
+                    {
+                        max_prob = prob;
+                        best_idx = j;
+                    }
+                }
+
+                V.at<double>(t,i) = max_prob;
+                newpath[i] = path[best_idx] + vocabulary.at(i);
+            }
+
+            // Don't need to remember the old paths
+            path.swap(newpath);
+        }
+
+        double max_prob = 0;
+        int best_idx = 0;
+        for (int i=0; i<(int)vocabulary.size(); i++)
+        {
+            double prob = V.at<double>((int)obs.size()-1,i);
+            if ( prob > max_prob)
+            {
+                max_prob = prob;
+                best_idx = i;
+            }
+        }
+
+        //cout << path[best_idx] << endl;
+        if (out_sequence.size()>0) out_sequence = out_sequence+" "+path[best_idx];
+        else out_sequence = path[best_idx];
+
+        if (component_rects != NULL)
+            component_rects->push_back(words_rect[w]);
+        if (component_texts != NULL)
+            component_texts->push_back(path[best_idx]);
+        if (component_confidences != NULL)
+            component_confidences->push_back((float)max_prob);
+
+        }
+
+        return;
+    }
+
+    void run( Mat& image,
+              Mat& mask,
+              string& out_sequence,
+              vector<Rect>* component_rects,
+              vector<string>* component_texts,
+              vector<float>* component_confidences,
+              int component_level)
+    {
+
+        CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
+        CV_Assert( mask.type() == CV_8UC1 );
+        CV_Assert( (image.cols > 0) && (image.rows > 0) );
+        CV_Assert( (image.cols == mask.cols) && (image.rows == mask.rows) );
+        CV_Assert( component_level == OCR_LEVEL_WORD );
+
+        out_sequence.clear();
+        if (component_rects != NULL)
+            component_rects->clear();
+        if (component_texts != NULL)
+            component_texts->clear();
+        if (component_confidences != NULL)
+            component_confidences->clear();
+
+        // First we split a line into words
+        vector<Mat> words_mask;
+        vector<Rect> words_rect;
+
+        /// Find contours
+        vector< vector<Point> > contours;
+        vector<Vec4i> hierarchy;
+        Mat tmp;
+        mask.copyTo(tmp);
+        findContours( tmp, contours, hierarchy, RETR_EXTERNAL, CHAIN_APPROX_SIMPLE, Point(0, 0) );
+        if (contours.size() < 6)
+        {
+            //do not split lines with less than 6 characters
+            words_mask.push_back(mask);
+            words_rect.push_back(Rect(0,0,mask.cols,mask.rows));
+        }
+        else
+        {
+
+            Mat_<int> vector_w((int)mask.cols,1);
+            reduce(mask, vector_w, 0, REDUCE_SUM, -1);
+
+            vector<int> spaces;
+            vector<int> spaces_start;
+            vector<int> spaces_end;
+            int space_count=0;
+            int last_one_idx;
+
+            int s_init = 0, s_end=vector_w.cols;
+            for (int s=0; s<vector_w.cols; s++)
+            {
+                if (vector_w.at<int>(0,s) == 0)
+                    s_init = s+1;
+                else
+                    break;
+            }
+            for (int s=vector_w.cols-1; s>=0; s--)
+            {
+                if (vector_w.at<int>(0,s) == 0)
+                    s_end = s;
+                else
+                    break;
+            }
+
+            for (int s=s_init; s<s_end; s++)
+            {
+                if (vector_w.at<int>(0,s) == 0)
+                {
+                    space_count++;
+                } else {
+                    if (space_count!=0)
+                    {
+                        spaces.push_back(space_count);
+                        spaces_start.push_back(last_one_idx);
+                        spaces_end.push_back(s-1);
+                    }
+                    space_count = 0;
+                    last_one_idx = s;
+                }
+            }
+            Scalar mean_space,std_space;
+            meanStdDev(Mat(spaces),mean_space,std_space);
+            int num_word_spaces = 0;
+            int last_word_space_end = 0;
+            for (int s=0; s<(int)spaces.size(); s++)
+            {
+                if (spaces_end.at(s)-spaces_start.at(s) > mean_space[0]+(mean_space[0]*1.1)) //TODO: should this 1.1 factor be a parameter?
+                {
+                    if (num_word_spaces == 0)
+                    {
+                        //cout << " we have a word from 0 to " << spaces_start.at(s) << endl;
+                        Mat word_mask;
+                        Rect word_rect = Rect(0,0,spaces_start.at(s),mask.rows);
+                        mask(word_rect).copyTo(word_mask);
+
+                        words_mask.push_back(word_mask);
+                        words_rect.push_back(word_rect);
+                    }
+                    else
+                    {
+                        //cout << " we have a word from " << last_word_space_end << " to " << spaces_start.at(s) << endl;
+                        Mat word_mask;
+                        Rect word_rect = Rect(last_word_space_end,0,spaces_start.at(s)-last_word_space_end,mask.rows);
+                        mask(word_rect).copyTo(word_mask);
+
+                        words_mask.push_back(word_mask);
+                        words_rect.push_back(word_rect);
+                    }
+                    num_word_spaces++;
+                    last_word_space_end = spaces_end.at(s);
+                }
+            }
+            //cout << " we have a word from " << last_word_space_end << " to " << vector_w.cols << endl << endl << endl;
+            Mat word_mask;
+            Rect word_rect = Rect(last_word_space_end,0,vector_w.cols-last_word_space_end,mask.rows);
+            mask(word_rect).copyTo(word_mask);
+
+            words_mask.push_back(word_mask);
+            words_rect.push_back(word_rect);
+
+        }
+
+        for (int w=0; w<(int)words_mask.size(); w++)
+        {
+
+            vector< vector<int> > observations;
+            vector< vector<double> > confidences;
+            vector<int> obs;
+            // First find contours and sort by x coordinate of bbox
+            words_mask[w].copyTo(tmp);
+            if (tmp.empty())
+                continue;
+            contours.clear();
+            hierarchy.clear();
+            /// Find contours
+            findContours( tmp, contours, hierarchy, RETR_EXTERNAL, CHAIN_APPROX_SIMPLE, Point(0, 0) );
+            vector<Rect> contours_rect;
+            for (int i=0; i<(int)contours.size(); i++)
+            {
+                contours_rect.push_back(boundingRect(contours[i]));
+            }
+
+            sort(contours_rect.begin(), contours_rect.end(), sort_rect_horiz);
+
+            // Do character recognition for each contour
+            for (int i=0; i<(int)contours.size(); i++)
+            {
+                vector<int> out_class;
+                vector<double> out_conf;
+                //take the center of the char rect and translate it to the real origin
+                Point char_center = Point(contours_rect.at(i).x+contours_rect.at(i).width/2,
+                                          contours_rect.at(i).y+contours_rect.at(i).height/2);
+                char_center.x += words_rect[w].x;
+                char_center.y += words_rect[w].y;
+                int win_size = max(contours_rect.at(i).width,contours_rect.at(i).height);
+                win_size += (int)(win_size*0.6); // add some pixels in the border. TODO: should this be a user parameter?
+                Rect char_rect = Rect(char_center.x-win_size/2,char_center.y-win_size/2,win_size,win_size);
+                char_rect &= Rect(0,0,image.cols,image.rows);
+                Mat tmp_image;
+                image(char_rect).copyTo(tmp_image);
+
+                classifier->eval(tmp_image,out_class,out_conf);
+                if (!out_class.empty())
+                    obs.push_back(out_class[0]);
+                //cout << " out class = " << vocabulary[out_class[0]] << "(" << out_conf[0] << ")" << endl;
+                observations.push_back(out_class);
+                confidences.push_back(out_conf);
             }
 
@@ -335,7 +609,8 @@ public:
         }
 
         //cout << path[best_idx] << endl;
-        out_sequence = out_sequence+" "+path[best_idx];
+        if (out_sequence.size()>0) out_sequence = out_sequence+" "+path[best_idx];
+        else out_sequence = path[best_idx];
 
         if (component_rects != NULL)
             component_rects->push_back(words_rect[w]);
@@ -598,6 +873,278 @@ Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierNM(const std::string& filename)
 {
     return makePtr<OCRHMMClassifierNM>(filename);
 }
+
+
+class CV_EXPORTS OCRHMMClassifierCNN : public OCRHMMDecoder::ClassifierCallback
+{
+public:
+    //constructor
+    OCRHMMClassifierCNN(const std::string& filename);
+    // Destructor
+    ~OCRHMMClassifierCNN() {}
+
+    void eval( InputArray image, vector<int>& out_class, vector<double>& out_confidence );
+
+protected:
+    void normalizeAndZCA(Mat& patches);
+    double eval_feature(Mat& feature, double* prob_estimates);
+
+private:
+    int nr_class;     // number of classes
+    int nr_feature;   // number of features
+    Mat feature_min;  // scale range
+    Mat feature_max;
+    Mat weights;      // Logistic Regression weights
+    Mat kernels;      // CNN kernels
+    Mat M, P;         // ZCA Whitening parameters
+    int window_size;  // window size
+    int quad_size;
+    int patch_size;
+    int num_quads;    // extract 25 quads (12x12) from each image
+    int num_tiles;    // extract 25 patches (8x8) from each quad
+    double alpha;     // used in the non-linear activation function z = max(0, |D*a| - alpha)
+};
+
+OCRHMMClassifierCNN::OCRHMMClassifierCNN (const string& filename)
+{
+    if (ifstream(filename.c_str()))
+    {
+        FileStorage fs(filename, FileStorage::READ);
+        // Load kernels bank and whitening params
+        fs["kernels"] >> kernels;
+        fs["M"] >> M;
+        fs["P"] >> P;
+        // Load Logistic Regression weights
+        fs["weights"] >> weights;
+        // Load feature scaling ranges
+        fs["feature_min"] >> feature_min;
+        fs["feature_max"] >> feature_max;
+        fs.release();
+    }
+    else
+        CV_Error(Error::StsBadArg, "Default classifier data file not found!");
+
+    // check all matrix dimensions match correctly and no one is empty
+    CV_Assert( (M.cols > 0) && (M.rows > 0) );
+    CV_Assert( (P.cols > 0) && (P.rows > 0) );
+    CV_Assert( (kernels.cols > 0) && (kernels.rows > 0) );
+    CV_Assert( (weights.cols > 0) && (weights.rows > 0) );
+    CV_Assert( (feature_min.cols > 0) && (feature_min.rows > 0) );
+    CV_Assert( (feature_max.cols > 0) && (feature_max.rows > 0) );
+
+    nr_feature  = weights.rows;
+    nr_class    = weights.cols;
+    patch_size  = (int)sqrt(kernels.cols);
+    // algorithm internal parameters
+    window_size = 32;
+    num_quads   = 25;
+    num_tiles   = 25;
+    quad_size   = 12;
+    alpha       = 0.5;
+}
+
+void OCRHMMClassifierCNN::eval( InputArray _src, vector<int>& out_class, vector<double>& out_confidence )
+{
+
+    CV_Assert(( _src.getMat().type() == CV_8UC3 ) || ( _src.getMat().type() == CV_8UC1 ));
+
+    out_class.clear();
+    out_confidence.clear();
+
+
+    Mat img = _src.getMat();
+    if(img.type() == CV_8UC3)
+    {
+        cvtColor(img,img,COLOR_RGB2GRAY);
+    }
+
+    // shall we resize the input image or make a copy?
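+    // For reference: with window_size=32, quad_size=12 and stride quad_size/2-1 = 5, the loops
+    // below visit 5x5 = 25 quads (num_quads), and each 12x12 quad yields (12-8+1)^2 = 25
+    // overlapping patches (num_tiles), with patch_size = sqrt(kernels.cols) = 8 for the
+    // provided model.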
+    resize(img,img,Size(window_size,window_size));
+
+    Mat quad;
+    Mat tmp;
+
+    int patch_count = 0;
+    vector< vector<double> > data_pool(9);
+
+
+    int quad_id = 1;
+    for (int q_x=0; q_x<=window_size-quad_size; q_x=q_x+(int)(quad_size/2-1))
+    {
+        for (int q_y=0; q_y<=window_size-quad_size; q_y=q_y+(int)(quad_size/2-1))
+        {
+            Rect quad_rect = Rect(q_x,q_y,quad_size,quad_size);
+            quad = img(quad_rect);
+
+            //start sliding window (8x8) in each tile and store the patch as row in data_pool
+            for (int w_x=0; w_x<=quad_size-patch_size; w_x++)
+            {
+                for (int w_y=0; w_y<=quad_size-patch_size; w_y++)
+                {
+                    quad(Rect(w_x,w_y,patch_size,patch_size)).copyTo(tmp);
+                    tmp = tmp.reshape(0,1);
+                    tmp.convertTo(tmp, CV_64F);
+                    normalizeAndZCA(tmp);
+                    vector<double> patch;
+                    tmp.copyTo(patch);
+                    if ((quad_id == 1)||(quad_id == 2)||(quad_id == 6)||(quad_id == 7))
+                        data_pool[0].insert(data_pool[0].end(),patch.begin(),patch.end());
+                    if ((quad_id == 2)||(quad_id == 7)||(quad_id == 3)||(quad_id == 8)||(quad_id == 4)||(quad_id == 9))
+                        data_pool[1].insert(data_pool[1].end(),patch.begin(),patch.end());
+                    if ((quad_id == 4)||(quad_id == 9)||(quad_id == 5)||(quad_id == 10))
+                        data_pool[2].insert(data_pool[2].end(),patch.begin(),patch.end());
+                    if ((quad_id == 6)||(quad_id == 11)||(quad_id == 16)||(quad_id == 7)||(quad_id == 12)||(quad_id == 17))
+                        data_pool[3].insert(data_pool[3].end(),patch.begin(),patch.end());
+                    if ((quad_id == 7)||(quad_id == 12)||(quad_id == 17)||(quad_id == 8)||(quad_id == 13)||(quad_id == 18)||(quad_id == 9)||(quad_id == 14)||(quad_id == 19))
+                        data_pool[4].insert(data_pool[4].end(),patch.begin(),patch.end());
+                    if ((quad_id == 9)||(quad_id == 14)||(quad_id == 19)||(quad_id == 10)||(quad_id == 15)||(quad_id == 20))
+                        data_pool[5].insert(data_pool[5].end(),patch.begin(),patch.end());
+                    if ((quad_id == 16)||(quad_id == 21)||(quad_id == 17)||(quad_id == 22))
+                        data_pool[6].insert(data_pool[6].end(),patch.begin(),patch.end());
+                    if ((quad_id == 17)||(quad_id == 22)||(quad_id == 18)||(quad_id == 23)||(quad_id == 19)||(quad_id == 24))
+                        data_pool[7].insert(data_pool[7].end(),patch.begin(),patch.end());
+                    if ((quad_id == 19)||(quad_id == 24)||(quad_id == 20)||(quad_id == 25))
+                        data_pool[8].insert(data_pool[8].end(),patch.begin(),patch.end());
+                    patch_count++;
+                }
+            }
+
+            quad_id++;
+        }
+    }
+
+    //do dot product of each normalized and whitened patch
+    //each pool is averaged and this yields a representation of 9xD
+    Mat feature = Mat::zeros(9,kernels.rows,CV_64FC1);
+    for (int i=0; i<9; i++)
+    {
+        Mat pool = Mat(data_pool[i]);
+        pool = pool.reshape(0,(int)data_pool[i].size()/kernels.cols);
+        for (int p=0; p<pool.rows; p++)
+        {
+            for (int f=0; f<kernels.rows; f++)
+            {
+                feature.row(i).at<double>(0,f) = feature.row(i).at<double>(0,f) + max(0.0,std::abs(pool.row(p).dot(kernels.row(f)))-alpha);
+            }
+        }
+    }
+    feature = feature.reshape(0,1);
+
+
+    // data must be normalized within the range obtained during training
+    double lower = -1.0;
+    double upper =  1.0;
+    for (int k=0; k<nr_feature; k++)
+    {
+        feature.at<double>(0,k) = lower + (upper-lower) *
+                (feature.at<double>(0,k)-feature_min.at<double>(0,k))/
+                (feature_max.at<double>(0,k)-feature_min.at<double>(0,k));
+    }
+
+    double *p = new double[nr_class];
+    double predict_label = eval_feature(feature,p);
+    //cout << " Prediction: " << vocabulary[predict_label] << " with probability " << p[0] << endl;
+    if (predict_label < 0)
+        CV_Error(Error::StsInternal, "OCRHMMClassifierCNN::eval Error: unexpected prediction in eval_feature()");
+
+    out_class.push_back((int)predict_label);
+    out_confidence.push_back(p[(int)predict_label]);
+
+}
+
+// normalize for contrast and apply ZCA whitening to a set of image patches
+void OCRHMMClassifierCNN::normalizeAndZCA(Mat& patches)
+{
+
+    //Normalize for contrast
+    for (int i = 0; i<patches.rows; i++)
+    {
+        Scalar row_mean, row_std;
+        meanStdDev(patches.row(i),row_mean,row_std);
+        row_std[0] = sqrt(pow(row_std[0],2)*patches.cols/(patches.cols-1)+10);
+        patches.row(i) = (patches.row(i) - row_mean[0]) / row_std[0];
+    }
+
+
+    //ZCA whitening
+    if ((M.dims == 0) || (P.dims == 0))
+    {
+        Mat CC;
+        calcCovarMatrix(patches,CC,M,COVAR_NORMAL|COVAR_ROWS|COVAR_SCALE);
+        CC = CC * patches.rows / (patches.rows-1);
+
+        Mat e_val,e_vec;
+        eigen(CC.t(),e_val,e_vec);
+        e_vec = e_vec.t();
+        sqrt(1./(e_val + 0.1), e_val);
+
+        Mat V = e_vec.t();
+        Mat D = Mat::zeros(e_vec.rows,e_vec.rows,CV_64FC1);
+        for (int i=0; i<e_vec.rows; i++)
+        {
+            D.at<double>(i,i) = e_val.at<double>(0,e_val.rows-i-1);
+        }
+
+        P = V * D * V.t();
+    }
+
+    for (int i=0; i<patches.rows; i++)
+    {
+        Mat patch_zca = patches.row(i) - M;
+        patch_zca = patch_zca * P;
+        patch_zca.copyTo(patches.row(i));
+    }
+
+}
+
+double OCRHMMClassifierCNN::eval_feature(Mat& feature, double* prob_estimates)
+{
+    for (int i=0; i<nr_class; i++)
+        prob_estimates[i] = 0;
+
+    for (int idx=0; idx<nr_feature; idx++)
+        for (int i=0; i<nr_class; i++)
+            prob_estimates[i] += weights.at<float>(idx,i)*feature.at<double>(0,idx); //TODO use vectorized dot product
+
+    int dec_max_idx = 0;
+    for (int i=1; i<nr_class; i++)
+    {
+        if (prob_estimates[i] > prob_estimates[dec_max_idx])
+            dec_max_idx = i;
+    }
+
+    for (int i=0; i<nr_class; i++)
+        prob_estimates[i] = 1/(1+exp(-prob_estimates[i]));
+
+    double sum = 0;
+    for (int i=0; i<nr_class; i++)
+        sum += prob_estimates[i];
+
+    for (int i=0; i<nr_class; i++)
+        prob_estimates[i] = prob_estimates[i]/sum;
+
+    return dec_max_idx;
+}
+
+Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierCNN(const std::string& filename)
+
+{
+    return makePtr<OCRHMMClassifierCNN>(filename);
+}
+
 /** @brief Utility function to create a tailored language model transitions table from a given list of words (lexicon).
 
 @param vocabulary The language vocabulary (chars when ascii english text).
diff --git a/modules/text/src/ocr_tesseract.cpp b/modules/text/src/ocr_tesseract.cpp
index 640a7e3b9..79695f0d7 100644
--- a/modules/text/src/ocr_tesseract.cpp
+++ b/modules/text/src/ocr_tesseract.cpp
@@ -70,6 +70,22 @@ void OCRTesseract::run(Mat& image, string& output_text, vector<Rect>* component_rects,
         component_confidences->clear();
 }
 
+void OCRTesseract::run(Mat& image, Mat& mask, string& output_text, vector<Rect>* component_rects,
+                       vector<string>* component_texts, vector<float>* component_confidences,
+                       int component_level)
+{
+    CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
+    CV_Assert( mask.type() == CV_8UC1 );
+    CV_Assert( (component_level == OCR_LEVEL_TEXTLINE) || (component_level == OCR_LEVEL_WORD) );
+    output_text.clear();
+    if (component_rects != NULL)
+        component_rects->clear();
+    if (component_texts != NULL)
+        component_texts->clear();
+    if (component_confidences != NULL)
+        component_confidences->clear();
+}
+
 class OCRTesseractImpl : public OCRTesseract
 {
 private:
@@ -189,6 +205,16 @@ public:
 #endif
     }
 
+    void run(Mat& image, Mat& mask, string& output, vector<Rect>* component_rects=NULL,
+             vector<string>* component_texts=NULL, vector<float>* component_confidences=NULL,
+             int component_level=0)
+    {
+        CV_Assert( mask.type() == CV_8UC1 );
+        CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
+
+        run( mask, output, component_rects, component_texts, component_confidences, component_level);
+    }
+
 };
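A closing usage note (a sketch, not part of the patch): for OCRTesseract the new overload simply runs recognition on the binary mask itself, so the following two calls are expected to produce the same text:

    Ptr<OCRTesseract> ocr = OCRTesseract::create();
    std::string a, b;
    ocr->run(mask, a);         // existing single-image overload
    ocr->run(image, mask, b);  // new overload: the image is only validated, the mask is recognized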