diff --git a/modules/text/include/opencv2/text/ocr.hpp b/modules/text/include/opencv2/text/ocr.hpp
index 0155360de..7d8149672 100644
--- a/modules/text/include/opencv2/text/ocr.hpp
+++ b/modules/text/include/opencv2/text/ocr.hpp
@@ -338,6 +338,9 @@ public:
     including 0 as start-sequence location.
      */
     virtual void eval( InputArray image, std::vector< std::vector<double> >& recognition_probabilities, std::vector<int>& oversegmentation );
+
+    int getWindowSize() {return 0;}
+    int getStepSize() {return 0;}
 };
 
 public:
@@ -396,7 +399,7 @@ public:
                  InputArray emission_probabilities_table, // Table with observation emission probabilities
                                                           //     cols == rows == vocabulari.size()
                  decoder_mode mode = OCR_DECODER_VITERBI, // HMM Decoding algorithm (only Viterbi for the moment)
-                 int beam_size = 50);                     // Size of the beam in Beam Search algorithm
+                 int beam_size = 500);                    // Size of the beam in Beam Search algorithm
 
 protected:
 
diff --git a/modules/text/samples/cropped_word_recognition.cpp b/modules/text/samples/cropped_word_recognition.cpp
index 65ac792f1..32e3570e5 100644
--- a/modules/text/samples/cropped_word_recognition.cpp
+++ b/modules/text/samples/cropped_word_recognition.cpp
@@ -39,12 +39,13 @@ int main(int argc, char* argv[])
     string vocabulary = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; // must have the same order as the clasifier output classes
     vector<string> lexicon;  // a list of words expected to be found on the input image
     lexicon.push_back(string("abb"));
-    lexicon.push_back(string("patata"));
+    lexicon.push_back(string("riser"));
     lexicon.push_back(string("CHINA"));
     lexicon.push_back(string("HERE"));
     lexicon.push_back(string("President"));
     lexicon.push_back(string("smash"));
     lexicon.push_back(string("KUALA"));
+    lexicon.push_back(string("Produkt"));
     lexicon.push_back(string("NINTENDO"));
 
     // Create tailored language model a small given lexicon
@@ -54,16 +55,18 @@ int main(int argc, char* argv[])
     // An alternative would be to load the default generic language model
     // (created from ispell 42869 english words list)
     /*Mat transition_p;
-    string filename = "OCRHMM_transitions_table.xml"; // TODO use same order for voc
+    string filename = "OCRHMM_transitions_table.xml";
     FileStorage fs(filename, FileStorage::READ);
    fs["transition_probabilities"] >> transition_p;
     fs.release();*/
 
     Mat emission_p = Mat::eye(62,62,CV_64FC1);
 
+    // Notice we set here a beam size of 50. This is much faster than using the default value (500).
+    // 50 works well with our tiny lexicon example, but may not with larger dictionaries.
     Ptr<OCRBeamSearchDecoder> ocr = OCRBeamSearchDecoder::create(
                 loadOCRBeamSearchClassifierCNN("OCRBeamSearch_CNN_model_data.xml.gz"),
-                vocabulary, transition_p, emission_p);
+                vocabulary, transition_p, emission_p, OCR_DECODER_VITERBI, 50);
 
     double t_r = (double)getTickCount();
     string output;
diff --git a/modules/text/samples/scenetext_word03.jpg b/modules/text/samples/scenetext_word03.jpg
new file mode 100644
index 000000000..7d1753c60
Binary files /dev/null and b/modules/text/samples/scenetext_word03.jpg differ
diff --git a/modules/text/samples/scenetext_word04.jpg b/modules/text/samples/scenetext_word04.jpg
new file mode 100644
index 000000000..70d45c070
Binary files /dev/null and b/modules/text/samples/scenetext_word04.jpg differ
diff --git a/modules/text/src/ocr_beamsearch_decoder.cpp b/modules/text/src/ocr_beamsearch_decoder.cpp
index f11546dea..0a87285f1 100644
--- a/modules/text/src/ocr_beamsearch_decoder.cpp
+++ b/modules/text/src/ocr_beamsearch_decoder.cpp
@@ -72,13 +72,12 @@ void OCRBeamSearchDecoder::run(Mat& image, string& output_text, vector<Rect>* co
     if (component_confidences != NULL)
         component_confidences->clear();
 }
 
-
 void OCRBeamSearchDecoder::run(Mat& image, Mat& mask, string& output_text, vector<Rect>* component_rects,
                                vector<string>* component_texts, vector<float>* component_confidences, int component_level)
 {
+    CV_Assert(mask.type() == CV_8UC1);
     CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
-    CV_Assert( mask.type() == CV_8UC1 );
     CV_Assert( (component_level == OCR_LEVEL_TEXTLINE) || (component_level == OCR_LEVEL_WORD) );
     output_text.clear();
     if (component_rects != NULL)
@@ -102,11 +101,18 @@ void OCRBeamSearchDecoder::ClassifierCallback::eval( InputArray image, vector< v
     oversegmentation.clear();
 }
 
+struct beamSearch_node {
+    double score;
+    vector<int> segmentation;
+    bool expanded;
+    // TODO calculating the score of its children would be much faster if we stored the last column
+    //      of their "root" path.
+};
 
-bool beam_sort_function ( pair< double,vector<int> > i, pair< double,vector<int> > j );
-bool beam_sort_function ( pair< double,vector<int> > i, pair< double,vector<int> > j )
+bool beam_sort_function ( beamSearch_node a, beamSearch_node b );
+bool beam_sort_function ( beamSearch_node a, beamSearch_node b )
 {
-    return (i.first > j.first);
+    return (a.score > b.score);
 }
 
 
@@ -122,17 +128,43 @@ public:
                             int _beam_size)
     {
         classifier = _classifier;
-        transition_p = transition_probabilities_table.getMat();
+        step_size = classifier->getStepSize();
+        win_size  = classifier->getWindowSize();
         emission_p = emission_probabilities_table.getMat();
         vocabulary = _vocabulary;
         mode = _mode;
         beam_size = _beam_size;
+
+        transition_probabilities_table.getMat().copyTo(transition_p);
+        for (int i=0; i<transition_p.rows; i++)
+        {
+            for (int j=0; j<transition_p.cols; j++)
+            {
+                if (transition_p.at<double>(i,j) == 0)
+                    transition_p.at<double>(i,j) = -DBL_MAX;
+                else
+                    transition_p.at<double>(i,j) = log(transition_p.at<double>(i,j));
+            }
+        }
     }
 
     ~OCRBeamSearchDecoderImpl()
     {
     }
 
+    void run( Mat& src,
+              Mat& mask,
+              string& out_sequence,
+              vector<Rect>* component_rects,
+              vector<string>* component_texts,
+              vector<float>* component_confidences,
+              int component_level)
+    {
+        CV_Assert(mask.type() == CV_8UC1);
+        // nothing to do with a mask here
+        run( src, out_sequence, component_rects, component_texts, component_confidences,
+             component_level);
+    }
+
     void run( Mat& src,
               string& out_sequence,
              vector<Rect>* component_rects,
@@ -152,20 +184,62 @@ public:
         if (component_confidences != NULL)
             component_confidences->clear();
 
-        // TODO We must split a line into words or specify we only work with words
-
         if(src.type() == CV_8UC3)
         {
             cvtColor(src,src,COLOR_RGB2GRAY);
         }
 
-        vector< vector<double> > recognition_probabilities;
-        vector<int> oversegmentation;
+        // TODO if input is a text line (not a word) we may need to split into words here!
+
+        // do sliding window classification along a cropped word image
         classifier->eval(src, recognition_probabilities, oversegmentation);
 
-        /*Now we go here with the beam search algorithm to optimize the recognition score*/
+        // if the number of oversegmentation points found is less than 2 we cannot do anything!!
+        if (oversegmentation.size() < 2) return;
+
+        //NMS of recognitions
+        double last_best_p = 0;
+        int last_best_idx = -1;
+        for (size_t i=0; i<recognition_probabilities.size(); )
+        {
+            double best_p = 0;
+            int best_idx = -1;
+            for (size_t j=0; j<recognition_probabilities[i].size(); j++)
+            {
+                if (recognition_probabilities[i][j] > best_p)
+                {
+                    best_p = recognition_probabilities[i][j];
+                    best_idx = (int)j;
+                }
+            }
+
+            if ((i>0) && (best_idx == last_best_idx)
+                 && (oversegmentation[i]*step_size < oversegmentation[i-1]*step_size + win_size) )
+            {
+                if (last_best_p > best_p)
+                {
+                    //remove i'th elements and do not increment i
+                    recognition_probabilities.erase (recognition_probabilities.begin()+i);
+                    oversegmentation.erase (oversegmentation.begin()+i);
+                    continue;
+                } else {
+                    //remove (i-1)'th elements and do not increment i
+                    recognition_probabilities.erase (recognition_probabilities.begin()+i-1);
+                    oversegmentation.erase (oversegmentation.begin()+i-1);
+                    last_best_idx = best_idx;
+                    last_best_p = best_p;
+                    continue;
+                }
+            }
+
+            last_best_idx = best_idx;
+            last_best_p = best_p;
+            i++;
+        }
+
+        /*Now we go with the beam search algorithm to optimize the recognition score*/
 
         //convert probabilities to log probabilities
         for (size_t i=0; i<recognition_probabilities.size(); i++)
         {
             for (size_t j=0; j<recognition_probabilities[i].size(); j++)
             {
                 if (recognition_probabilities[i][j] == 0)
                     recognition_probabilities[i][j] = -DBL_MAX;
                 else
                     recognition_probabilities[i][j] = log(recognition_probabilities[i][j]);
             }
         }
-        for (int i=0; i<transition_p.rows; i++)
-        {
-            for (int j=0; j<transition_p.cols; j++)
-            {
-                if (transition_p.at<double>(i,j) == 0)
-                    transition_p.at<double>(i,j) = -DBL_MAX;
-                else
-                    transition_p.at<double>(i,j) = log(transition_p.at<double>(i,j));
-            }
-        }
-
-        set<unsigned long long int> visited_nodes; //TODO make it member of class
-
-        vector<int> start_segmentation;
-        start_segmentation.push_back(oversegmentation[0]);
-        start_segmentation.push_back(oversegmentation[oversegmentation.size()-1]);
-
-        vector< pair< double,vector<int> > > beam;
-        beam.push_back( pair< double,vector<int> > (score_segmentation(start_segmentation, recognition_probabilities, out_sequence), start_segmentation) );
-
-        vector< vector<int> > childs = generate_childs(start_segmentation,oversegmentation, visited_nodes);
-
-        if (!childs.empty())
-            update_beam( beam, childs, recognition_probabilities);
-        //cout << "beam size " << beam.size() << "  best score " << beam[0].first<< endl;
-
-        int generated_chids = (int)childs.size();
+        // initialize the beam with all possible character pairs
+        int generated_chids = 0;
+        for (size_t i=0; i<oversegmentation.size()-1; i++)
+        {
+            for (size_t j=i+1; j<oversegmentation.size(); j++)
+            {
+                beamSearch_node node;
+                node.segmentation.push_back((int)i);
+                node.segmentation.push_back((int)j);
+                node.score = score_segmentation(node.segmentation, out_sequence);
+
+                vector< vector<int> > childs = generate_childs( node.segmentation );
+                node.expanded = true;
+
+                beam.push_back( node );
+
+                if (!childs.empty())
+                    update_beam( childs );
+
+                generated_chids += (int)childs.size();
+            }
+        }
+
         while (generated_chids != 0)
         {
             generated_chids = 0;
-            vector< pair< double,vector<int> > > old_beam = beam;
 
-            for (size_t i=0; i<old_beam.size(); i++)
-            {
-                childs = generate_childs(old_beam[i].second,oversegmentation,visited_nodes);
+            for (size_t i=0; i<beam.size(); i++)
+            {
+                vector< vector<int> > childs;
+                if (!beam[i].expanded)
+                {
+                    childs = generate_childs( beam[i].segmentation );
+                    beam[i].expanded = true;
+                }
                 if (!childs.empty())
-                    update_beam( beam, childs, recognition_probabilities);
+                    update_beam( childs );
                 generated_chids += (int)childs.size();
             }
-
-            //cout << "beam size " << beam.size() << "  best score " << beam[0].first << endl;
         }
 
+        // Done! Get the best prediction found into out_sequence
+        double lp = score_segmentation( beam[0].segmentation, out_sequence );
 
-        // FINISHED ! Get the best prediction found into out_sequence
-        score_segmentation(beam[0].second, recognition_probabilities, out_sequence);
-
-
-        // TODO fill other output parameters
+        // fill other (dummy) output parameters
+        component_rects->push_back(Rect(0,0,src.cols,src.rows));
+        component_texts->push_back(out_sequence);
+        component_confidences->push_back((float)exp(lp));
 
         return;
     }
 
-    void run( Mat& src,
-              Mat& mask,
-              string& out_sequence,
-              vector<Rect>* component_rects,
-              vector<string>* component_texts,
-              vector<float>* component_confidences,
-              int component_level)
-    {
-
-        CV_Assert( mask.type() == CV_8UC1 );
-
-        // Nothing to do with a mask here. We do slidding window anyway.
-        run( src, out_sequence, component_rects, component_texts, component_confidences, component_level );
-    }
-
 private:
 
+    int win_size;
+    int step_size;
 
-    ////////////////////////////////////////////////////////////
+    vector< beamSearch_node > beam;
+    vector< vector<double> > recognition_probabilities;
+    vector<int> oversegmentation;
 
-    // TODO the way we expand nodes makes the recognition score heuristic not monotonic
-    //      it should start from left node 0 and grow always to the right.
-
-    vector< vector<int> > generate_childs(vector<int> &segmentation, vector<int> &oversegmentation, set<unsigned long long int> &visited_nodes)
+    vector< vector<int> > generate_childs( vector<int> &segmentation )
     {
-        /*cout << " generate childs for [";
-        for (size_t i = 0 ; i < segmentation.size(); i++)
-            cout << segmentation[i] << ",";
-        cout << "] ";*/
 
         vector< vector<int> > childs;
-        for (size_t i=0; i<oversegmentation.size(); i++)
-        {
-            int seg_point = oversegmentation[i];
-            if (find(segmentation.begin(), segmentation.end(), seg_point) == segmentation.end())
-            {
-                vector<int> child = segmentation;
-                child.push_back(seg_point);
-                sort(child.begin(), child.end());
-                unsigned long long int key = 0;
-                for (size_t j=0; j<child.size(); j++)
-                    key += (unsigned long long int)pow(10,(int)j*2)*child[j];
-                if (visited_nodes.find(key) == visited_nodes.end())
-                {
-                    childs.push_back(child);
-                    visited_nodes.insert(key);
-                }
-            }
-        }
+        // append each remaining segmentation point to the right of the current segmentation
+        for (int i=segmentation[segmentation.size()-1]+1; i<(int)oversegmentation.size(); i++)
+        {
+            vector<int> child = segmentation;
+            child.push_back(i);
+            childs.push_back(child);
+        }
         return childs;
     }
 
-    void update_beam (vector< pair< double,vector<int> > > &beam, vector< vector<int> > &childs, vector< vector<double> > &recognition_probabilities)
+    void update_beam ( vector< vector<int> > &childs )
     {
         string out_sequence;
         double min_score = -DBL_MAX; //min score value to be part of the beam
-        if ((int)beam.size() == beam_size)
-            min_score = beam[beam.size()-1].first; //last element has the lowest score
+        if ((int)beam.size() >= beam_size)
+            min_score = beam[beam_size-1].score; //last element has the lowest score
+
         for (size_t i=0; i<childs.size(); i++)
         {
-            double score = score_segmentation(childs[i], recognition_probabilities, out_sequence);
+            double score = score_segmentation(childs[i], out_sequence);
             if (score > min_score)
             {
-                beam.push_back(pair< double,vector<int> >(score,childs[i]));
+                beamSearch_node node;
+                node.score = score;
+                node.segmentation = childs[i];
+                node.expanded = false;
+                beam.push_back(node);
                 sort(beam.begin(),beam.end(),beam_sort_function);
                 if ((int)beam.size() > beam_size)
                 {
-                    beam.pop_back();
-                    min_score = beam[beam.size()-1].first;
+                    beam.erase(beam.begin()+beam_size,beam.end());
+                    min_score = beam[beam.size()-1].score;
                 }
             }
         }
     }
 
-    ////////////////////////////////////////////////////////////
-    // TODO Add heuristics to the score function (see PhotoOCR paper)
-    // e.g.: in some cases we discard a segmentation because it includes a very large character
-    //       in other cases we do it because the overlapping between two chars is too large
-    //       etc.
-    double score_segmentation(vector<int> &segmentation, vector< vector<double> > &observations, string& outstring)
+    double score_segmentation( vector<int> &segmentation, string& outstring )
     {
-        //TODO This must be extracted from dictionary
+        // Score Heuristics:
+        // No need to use Viterbi to know a given segmentation is bad
+        // e.g.: in some cases we discard a segmentation because it includes a very large character
+        //       in other cases we do it because the overlapping between two chars is too large
+        // TODO Add more heuristics (e.g. penalize large inter-character variance)
+
+        Mat interdist ((int)segmentation.size()-1, 1, CV_32F, 1);
+        for (size_t i=0; i<segmentation.size()-1; i++)
+        {
+            interdist.at<float>((int)i,0) = (float)oversegmentation[segmentation[(int)i+1]]*step_size
+                                            - (float)oversegmentation[segmentation[(int)i]]*step_size;
+            if ((float)interdist.at<float>((int)i,0)/win_size > 2.25) // TODO explain how this threshold was set
+            {
+                return -DBL_MAX;
+            }
+            if ((float)interdist.at<float>((int)i,0)/win_size < 0.15) // TODO explain how this threshold was set
+            {
+                return -DBL_MAX;
+            }
+        }
+        Scalar m, std;
+        meanStdDev(interdist, m, std);
+        //double interdist_std = std[0];
+
+        //TODO Extracting start probs from lexicon (if we have it) may boost accuracy!
         vector<double> start_p(vocabulary.size());
         for (int i=0; i<(int)vocabulary.size(); i++)
             start_p[i] = log(1.0/vocabulary.size());
 
-        Mat V = Mat::ones((int)segmentation.size()-1,(int)vocabulary.size(),CV_64FC1);
+        Mat V = Mat::ones((int)segmentation.size(),(int)vocabulary.size(),CV_64FC1);
         V = V * -DBL_MAX;
         vector<string> path(vocabulary.size());
 
         // Initialize base cases (t == 0)
         for (int i=0; i<(int)vocabulary.size(); i++)
         {
-            V.at<double>(0,i) = start_p[i] + observations[segmentation[1]-1][i];
+            V.at<double>(0,i) = start_p[i] + recognition_probabilities[segmentation[0]][i];
             path[i] = vocabulary.at(i);
         }
 
         // Run Viterbi for t > 0
-        for (int t=1; t<(int)segmentation.size()-1; t++)
+        for (int t=1; t<(int)segmentation.size(); t++)
         {
             vector<string> newpath(vocabulary.size());
 
@@ -352,7 +416,7 @@ private:
                 int best_idx = 0;
                 for (int j=0; j<(int)vocabulary.size(); j++)
                 {
-                    double prob = V.at<double>(t-1,j) + transition_p.at<double>(j,i) + observations[segmentation[t+1]-1][i];
+                    double prob = V.at<double>(t-1,j) + transition_p.at<double>(j,i) + recognition_probabilities[segmentation[t]][i];
                     if ( prob > max_prob)
                     {
                         max_prob = prob;
@@ -372,7 +436,7 @@ private:
         int best_idx = 0;
         for (int i=0; i<(int)vocabulary.size(); i++)
         {
-            double prob = V.at<double>((int)segmentation.size()-2,i);
+            double prob = V.at<double>((int)segmentation.size()-1,i);
             if ( prob > max_prob)
             {
                 max_prob = prob;
@@ -380,9 +444,8 @@ private:
                 best_idx = i;
             }
         }
 
-        //cout << " score " << max_prob / (segmentation.size()-1) << " " << path[best_idx] << endl;
         outstring = path[best_idx];
-        return max_prob / (segmentation.size()-1);
+        return (max_prob / (segmentation.size()-1));
     }
 
 };
 
@@ -408,21 +471,24 @@ public:
     void eval( InputArray src, vector< vector<double> >& recognition_probabilities, vector<int>& oversegmentation );
 
+    int getWindowSize() {return window_size;}
+    int getStepSize() {return step_size;}
+    void setStepSize(int _step_size) {step_size = _step_size;}
+
 protected:
     void normalizeAndZCA(Mat& patches);
     double eval_feature(Mat& feature, double* prob_estimates);
 
 private:
-    //TODO implement getters/setters for some of these members (if apply)
-    int nr_class;    // number of classes
+    int window_size; // window size
+    int step_size;   // sliding window step
+    int nr_class;    // number of classes
     int nr_feature;  // number of features
     Mat feature_min; // scale range
     Mat feature_max;
     Mat weights;     // Logistic Regression weights
     Mat kernels;     // CNN kernels
     Mat M, P;        // ZCA Whitening parameters
-    int step_size;   // sliding window step
-    int window_size; // window size
     int quad_size;
     int patch_size;
     int num_quads;   // extract 25 quads (12x12) from each image
 
@@ -449,26 +515,15 @@ OCRBeamSearchClassifierCNN::OCRBeamSearchClassifierCNN (const string& filename)
     else
         CV_Error(Error::StsBadArg, "Default classifier data file not found!");
 
-    // check all matrix dimensions match correctly and no one is empty
-    CV_Assert( (M.cols > 0) && (M.rows > 0) );
-    CV_Assert( (P.cols > 0) && (P.rows > 0) );
-    CV_Assert( (kernels.cols > 0) && (kernels.rows > 0) );
-    CV_Assert( (weights.cols > 0) && (weights.rows > 0) );
-    CV_Assert( (feature_min.cols > 0) && (feature_min.rows > 0) );
-    CV_Assert( (feature_max.cols > 0) && (feature_max.rows > 0) );
-
-    nr_feature = weights.rows;
-    nr_class   = weights.cols;
+    nr_feature = weights.rows;
+    nr_class   = weights.cols;
     patch_size = (int)sqrt(kernels.cols);
-    // algorithm internal parameters
-    window_size = 32;
+    window_size = 4*patch_size;
+    step_size   = 4;
     quad_size   = 12;
     num_quads   = 25;
     num_tiles   = 25;
-    alpha       = 0.5;
-
-    step_size = 4; // TODO showld this be a parameter for the user?
-
+    alpha = 0.5; // used in non-linear activation function z = max(0, |D*a| - alpha)
 }
 
 void OCRBeamSearchClassifierCNN::eval( InputArray _src, vector< vector<double> >& recognition_probabilities, vector<int>& oversegmentation)
 
@@ -493,7 +548,6 @@ void OCRBeamSearchClassifierCNN::eval( InputArray _src, vector< vector<double> >
         resize(src,src,Size(window_size*src.cols/src.rows,window_size));
 
     int seg_points = 0;
-    oversegmentation.push_back(seg_points);
 
     Mat quad;
     Mat tmp;
 
@@ -584,19 +638,17 @@ void OCRBeamSearchClassifierCNN::eval( InputArray _src, vector< vector<double> >
         double *p = new double[nr_class];
         double predict_label = eval_feature(feature,p);
-        //cout << " Prediction: " << vocabulary[predict_label] << " with probability " << p[0] << endl;
 
-        if (predict_label < 0)
-            CV_Error(Error::StsInternal, "OCRBeamSearchClassifierCNN::eval Error: unexpected prediction in eval_feature()");
+        if ( (predict_label < 0) || (predict_label > nr_class) )
+            CV_Error(Error::StsOutOfRange, "OCRBeamSearchClassifierCNN::eval Error: unexpected prediction in eval_feature()");
 
-        seg_points++;
-        oversegmentation.push_back(seg_points);
-        vector<double> recognition_p(p, p+nr_class*sizeof(double));
-        recognition_probabilities.push_back(recognition_p);
+        vector<double> recognition_p(p, p+nr_class);
+        recognition_probabilities.push_back(recognition_p);
+        oversegmentation.push_back(seg_points);
+        seg_points++;
     }
 
-
 }
 
 // normalize for contrast and apply ZCA whitening to a set of image patches