Overload the run() method in the BaseOCR class in order to adapt to different classifier callbacks. The original run() method accepts a single Mat input image, which is expected to be a binarized image with black and white text; it works both with the OCRTesseract class and with the OCRHMMDecoder class when the character classifier callback operates on binary images (e.g. NM). The new run() method accepts two Mat input parameters: one for the grayscale (or color) source image and one for a binary mask in which each connected component corresponds to a pre-segmented character in the input image. This way the OCRHMMDecoder is able to work with character classifiers that operate on grayscale (or color) images (e.g. a CNN).
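
For illustration, a minimal usage sketch of the new overload (file names are placeholders; the OCRHMMDecoder::create() signature and loadOCRHMMClassifierCNN() are taken from the existing module, and the language-model tables here are only dummy stand-ins):

#include <opencv2/text.hpp>
#include <opencv2/imgcodecs.hpp>
using namespace cv;
using namespace cv::text;

int main()
{
    // Grayscale (or color) source image plus a binary mask where each
    // connected component is a pre-segmented character (placeholder files).
    Mat image = imread("word.png", IMREAD_GRAYSCALE);
    Mat mask  = imread("word_mask.png", IMREAD_GRAYSCALE);

    // 62-character vocabulary matching the default classifier.
    std::string vocabulary = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
    Mat emission_p = Mat::eye((int)vocabulary.size(), (int)vocabulary.size(), CV_64FC1);
    // Placeholder uniform transition model; in practice build it with
    // createOCRHMMTransitionsTable() or load OCRHMM_transitions_table.xml.
    Mat transition_p = Mat::ones((int)vocabulary.size(), (int)vocabulary.size(), CV_64FC1) / (double)vocabulary.size();

    Ptr<OCRHMMDecoder> ocr = OCRHMMDecoder::create(
            loadOCRHMMClassifierCNN("OCRBeamSearch_CNN_model_data.xml.gz"),
            vocabulary, transition_p, emission_p);

    std::string output;
    std::vector<Rect> rects;
    std::vector<std::string> words;
    std::vector<float> confidences;
    // New two-Mat overload: gray/color image plus the segmentation mask.
    ocr->run(image, mask, output, &rects, &words, &confidences, OCR_LEVEL_WORD);
    return 0;
}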

pull/321/head
lluis 9 years ago
parent c146d37b93
commit ee677a255b
  1. modules/text/include/opencv2/text/ocr.hpp (81 changed lines)
  2. modules/text/src/ocr_beamsearch_decoder.cpp (64 changed lines)
  3. modules/text/src/ocr_hmm_decoder.cpp (545 changed lines)
  4. modules/text/src/ocr_tesseract.cpp (26 changed lines)

@@ -69,6 +69,9 @@ public:
virtual void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0) = 0;
virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0) = 0;
};
/** @brief OCRTesseract class provides an interface with the tesseract-ocr API (v3.02.02) in C++.
@@ -106,6 +109,10 @@ public:
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
/** @brief Creates an instance of the OCRTesseract class. Initializes Tesseract.
@param datapath the name of the parent directory of tessdata ended with "/", or NULL to use the
@@ -170,11 +177,11 @@ public:
public:
/** @brief Recognize text using HMM.
Takes image on input and returns recognized text in the output_text parameter. Optionally
Takes binary image on input and returns recognized text in the output_text parameter. Optionally
provides also the Rects for individual text elements found (e.g. words), and the list of those
text elements with their confidence values.
@param image Input image CV_8UC1 with a single text line (or word).
@param image Input binary image CV_8UC1 with a single text line (or word).
@param output_text Output text. Most likely character sequence found by the HMM decoder.
@@ -193,6 +200,33 @@ public:
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
/** @brief Recognize text using HMM.
Takes an image and a mask (where each connected component corresponds to a segmented character)
on input and returns recognized text in the output_text parameter. Optionally
provides also the Rects for individual text elements found (e.g. words), and the list of those
text elements with their confidence values.
@param image Input image CV_8UC1 or CV_8UC3 with a single text line (or word).
@param mask Input binary image CV_8UC1 same size as input image. Each connected component in mask corresponds to a segmented character in the input image.
@param output_text Output text. Most likely character sequence found by the HMM decoder.
@param component_rects If provided the method will output a list of Rects for the individual
text elements found (e.g. words).
@param component_texts If provided the method will output a list of text strings for the
recognition of individual text elements found (e.g. words).
@param component_confidences If provided the method will output a list of confidence values
for the recognition of individual text elements found (e.g. words).
@param component_level Only OCR_LEVEL_WORD is supported.
*/
virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
/** @brief Creates an instance of the OCRHMMDecoder class. Initializes HMMDecoder.
@param classifier The character classifier with built in feature extractor.
@@ -231,7 +265,7 @@ protected:
@param filename The XML or YAML file with the classifier model (e.g. OCRHMM_knn_model_data.xml)
The default classifier is based in the scene text recognition method proposed by Lukás Neumann &
The KNN default classifier is based in the scene text recognition method proposed by Lukás Neumann &
Jiri Matas in [Neumann11b]. Basically, the region (contour) in the input image is normalized to a
fixed size, while retaining the centroid and aspect ratio, in order to extract a feature vector
based on gradient orientations along the chain-code of its perimeter. Then, the region is classified
@@ -240,19 +274,32 @@ types.
*/
CV_EXPORTS Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierNM(const std::string& filename);
/** @brief Utility function to create a tailored language model transitions table from a given list of words (lexicon).
/** @brief Allow to implicitly load the default character classifier when creating an OCRHMMDecoder object.
@param vocabulary The language vocabulary (chars when ascii english text).
@param filename The XML or YAML file with the classifier model (e.g. OCRBeamSearch_CNN_model_data.xml.gz)
@param lexicon The list of words that are expected to be found in a particular image.
The CNN default classifier is based in the scene text recognition method proposed by Adam Coates &
Andrew NG in [Coates11a]. The character classifier consists in a Single Layer Convolutional Neural Network and
a linear classifier. It is applied to the input image in a sliding window fashion, providing a set of recognitions
at each window location.
*/
CV_EXPORTS Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierCNN(const std::string& filename);
@param transition_probabilities_table Output table with transition probabilities between character pairs. cols == rows == vocabulary.size().
//! @}
The function calculate frequency statistics of character pairs from the given lexicon and fills the output transition_probabilities_table with them. The transition_probabilities_table can be used as input in the OCRHMMDecoder::create() and OCRBeamSearchDecoder::create() methods.
@note
- (C++) An alternative would be to load the default generic language transition table provided in the text module samples folder (created from ispell 42869 english words list) :
<https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/OCRHMM_transitions_table.xml>
*/
/** @brief Utility function to create a tailored language model transitions table from a given list of words (lexicon).
*
* @param vocabulary The language vocabulary (chars when ascii english text).
*
* @param lexicon The list of words that are expected to be found in a particular image.
*
* @param transition_probabilities_table Output table with transition probabilities between character pairs. cols == rows == vocabulary.size().
*
* The function calculate frequency statistics of character pairs from the given lexicon and fills the output transition_probabilities_table with them. The transition_probabilities_table can be used as input in the OCRHMMDecoder::create() and OCRBeamSearchDecoder::create() methods.
* @note
* - (C++) An alternative would be to load the default generic language transition table provided in the text module samples folder (created from ispell 42869 english words list) :
* <https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/OCRHMM_transitions_table.xml>
* */
CV_EXPORTS void createOCRHMMTransitionsTable(std::string& vocabulary, std::vector<std::string>& lexicon, OutputArray transition_probabilities_table);
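
For reference, a small sketch of how this utility might be used (the lexicon contents are illustrative):

std::string voc = "abcdefghijklmnopqrstuvwxyz";
std::vector<std::string> lexicon;
lexicon.push_back("open");
lexicon.push_back("text");
lexicon.push_back("line");
Mat transition_p;
createOCRHMMTransitionsTable(voc, lexicon, transition_p);
// transition_p is now voc.size() x voc.size() and can be passed to
// OCRHMMDecoder::create() or OCRBeamSearchDecoder::create().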
@@ -319,6 +366,10 @@ public:
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
/** @brief Creates an instance of the OCRBeamSearchDecoder class. Initializes HMMDecoder.
@param classifier The character classifier with built in feature extractor.
@@ -359,10 +410,10 @@ protected:
/** @brief Allow to implicitly load the default character classifier when creating an OCRBeamSearchDecoder object.
@param filename The XML or YAML file with the classifier model (e.g. OCRHMM_knn_model_data.xml)
@param filename The XML or YAML file with the classifier model (e.g. OCRBeamSearch_CNN_model_data.xml.gz)
The default classifier is based in the scene text recognition method proposed by Adam Coates &
Andrew NG in [Coates11a]. The character classifier sonsists in a Single Layer Convolutional Neural Network and
The CNN default classifier is based in the scene text recognition method proposed by Adam Coates &
Andrew NG in [Coates11a]. The character classifier consists in a Single Layer Convolutional Neural Network and
a linear classifier. It is applied to the input image in a sliding window fashion, providing a set of recognitions
at each window location.
*/
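
A corresponding sketch for creating a beam-search decoder with this default classifier (the loadOCRBeamSearchClassifierCNN() name is assumed from the existing module; vocabulary, transition_p and emission_p are placeholders prepared as in the sketch above):

Ptr<OCRBeamSearchDecoder> ocr_bs = OCRBeamSearchDecoder::create(
        loadOCRBeamSearchClassifierCNN("OCRBeamSearch_CNN_model_data.xml.gz"),
        vocabulary, transition_p, emission_p);
std::string output;
ocr_bs->run(image, output); // image: CV_8UC1 or CV_8UC3 crop of a single word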

@@ -73,6 +73,22 @@ void OCRBeamSearchDecoder::run(Mat& image, string& output_text, vector<Rect>* co
component_confidences->clear();
}
void OCRBeamSearchDecoder::run(Mat& image, Mat& mask, string& output_text, vector<Rect>* component_rects,
vector<string>* component_texts, vector<float>* component_confidences,
int component_level)
{
CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
CV_Assert( mask.type() == CV_8UC1 );
CV_Assert( (component_level == OCR_LEVEL_TEXTLINE) || (component_level == OCR_LEVEL_WORD) );
output_text.clear();
if (component_rects != NULL)
component_rects->clear();
if (component_texts != NULL)
component_texts->clear();
if (component_confidences != NULL)
component_confidences->clear();
}
void OCRBeamSearchDecoder::ClassifierCallback::eval( InputArray image, vector< vector<double> >& recognition_probabilities, vector<int>& oversegmentation)
{
@@ -136,7 +152,7 @@ public:
if (component_confidences != NULL)
component_confidences->clear();
// TODO split a line into words
// TODO We must split a line into words or specify we only work with words
if(src.type() == CV_8UC3)
{
@@ -174,14 +190,7 @@ public:
}
//TODO it would be interesting to have a hash table with a vector of booleans
// but this is not possible when we have a large number of possible segmentations.
//vector<bool> visited_nodes(pow(2,oversegmentation.size()),false); // hash table for visited nodes
// options are using std::set<unsigned long long int> to store only the keys of visited nodes
// but will deteriorate the time performance.
set<unsigned long long int> visited_nodes; //TODO make it member of class
// it is also possible to reduce the number of seg. points in some way (e.g. use only seg.points
// for which there is a change on the class prediction)
vector<int> start_segmentation;
start_segmentation.push_back(oversegmentation[0]);
@@ -221,6 +230,21 @@ public:
return;
}
void run( Mat& src,
Mat& mask,
string& out_sequence,
vector<Rect>* component_rects,
vector<string>* component_texts,
vector<float>* component_confidences,
int component_level)
{
CV_Assert( mask.type() == CV_8UC1 );
// Nothing to do with a mask here. We do sliding window anyway.
run( src, out_sequence, component_rects, component_texts, component_confidences, component_level );
}
private:
////////////////////////////////////////////////////////////
@@ -421,22 +445,29 @@ OCRBeamSearchClassifierCNN::OCRBeamSearchClassifierCNN (const string& filename)
fs["feature_min"] >> feature_min;
fs["feature_max"] >> feature_max;
fs.release();
// TODO check all matrix dimensions match correctly and no one is empty
}
else
CV_Error(Error::StsBadArg, "Default classifier data file not found!");
nr_feature = weights.rows;
nr_class = weights.cols;
// TODO some of this can be inferred from the input file (e.g. patch size must be sqrt(filters.cols))
step_size = 4;
// check all matrix dimensions match correctly and no one is empty
CV_Assert( (M.cols > 0) && (M.rows > 0) );
CV_Assert( (P.cols > 0) && (P.rows > 0) );
CV_Assert( (kernels.cols > 0) && (kernels.rows > 0) );
CV_Assert( (weights.cols > 0) && (weights.rows > 0) );
CV_Assert( (feature_min.cols > 0) && (feature_min.rows > 0) );
CV_Assert( (feature_max.cols > 0) && (feature_max.rows > 0) );
nr_feature = weights.rows;
nr_class = weights.cols;
patch_size = sqrt(kernels.cols);
// algorithm internal parameters
window_size = 32;
quad_size = 12;
patch_size = 8;
num_quads = 25;
num_tiles = 25;
alpha = 0.5;
step_size = 4; // TODO should this be a parameter for the user?
}
@@ -459,7 +490,6 @@ void OCRBeamSearchClassifierCNN::eval( InputArray _src, vector< vector<double> >
cvtColor(src,src,COLOR_RGB2GRAY);
}
// TODO shall we resize the input image or make a copy ?
resize(src,src,Size(window_size*src.cols/src.rows,window_size));
int seg_points = 0;
@@ -555,8 +585,8 @@ void OCRBeamSearchClassifierCNN::eval( InputArray _src, vector< vector<double> >
double *p = new double[nr_class];
double predict_label = eval_feature(feature,p);
//cout << " Prediction: " << vocabulary[predict_label] << " with probability " << p[0] << endl;
if (predict_label < 0) // TODO use cvError
cout << "OCRBeamSearchClassifierCNN::eval Error: unexpected prediction in eval_feature()" << endl;
if (predict_label < 0)
CV_Error(Error::StsInternal, "OCRBeamSearchClassifierCNN::eval Error: unexpected prediction in eval_feature()");
seg_points++;

@@ -74,6 +74,22 @@ void OCRHMMDecoder::run(Mat& image, string& output_text, vector<Rect>* component
component_confidences->clear();
}
void OCRHMMDecoder::run(Mat& image, Mat& mask, string& output_text, vector<Rect>* component_rects,
vector<string>* component_texts, vector<float>* component_confidences,
int component_level)
{
CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
CV_Assert( mask.type() == CV_8UC1 );
CV_Assert( (component_level == OCR_LEVEL_TEXTLINE) || (component_level == OCR_LEVEL_WORD) );
output_text.clear();
if (component_rects != NULL)
component_rects->clear();
if (component_texts != NULL)
component_texts->clear();
if (component_confidences != NULL)
component_confidences->clear();
}
void OCRHMMDecoder::ClassifierCallback::eval( InputArray image, vector<int>& out_class, vector<double>& out_confidence)
{
CV_Assert(( image.getMat().type() == CV_8UC3 ) || ( image.getMat().type() == CV_8UC1 ));
@@ -263,6 +279,263 @@ public:
obs.push_back(out_class[0]);
observations.push_back(out_class);
confidences.push_back(out_conf);
//cout << " out class = " << vocabulary[out_class[0]] << endl;
}
//This must be extracted from dictionary, or just assumed to be equal for all characters
vector<double> start_p(vocabulary.size());
for (int i=0; i<(int)vocabulary.size(); i++)
start_p[i] = 1.0/vocabulary.size();
Mat V = Mat::zeros((int)observations.size(),(int)vocabulary.size(),CV_64FC1);
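// V(t,i) holds the probability of the most likely character sequence that ends with vocabulary character i after the first t+1 observations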
vector<string> path(vocabulary.size());
// Initialize base cases (t == 0)
for (int i=0; i<(int)vocabulary.size(); i++)
{
for (int j=0; j<(int)observations[0].size(); j++)
{
emission_p.at<double>(observations[0][j],obs[0]) = confidences[0][j];
}
V.at<double>(0,i) = start_p[i] * emission_p.at<double>(i,obs[0]);
path[i] = vocabulary.at(i);
}
// Run Viterbi for t > 0
for (int t=1; t<(int)obs.size(); t++)
{
// Note: the emission matrix has to be re-initialized at every time step
emission_p = Mat::eye(62,62,CV_64FC1);
for (int e=0; e<(int)observations[t].size(); e++)
{
emission_p.at<double>(observations[t][e],obs[t]) = confidences[t][e];
}
vector<string> newpath(vocabulary.size());
for (int i=0; i<(int)vocabulary.size(); i++)
{
double max_prob = 0;
int best_idx = 0;
for (int j=0; j<(int)vocabulary.size(); j++)
{
double prob = V.at<double>(t-1,j) * transition_p.at<double>(j,i) * emission_p.at<double>(i,obs[t]);
if ( prob > max_prob)
{
max_prob = prob;
best_idx = j;
}
}
V.at<double>(t,i) = max_prob;
newpath[i] = path[best_idx] + vocabulary.at(i);
}
// Don't need to remember the old paths
path.swap(newpath);
}
double max_prob = 0;
int best_idx = 0;
for (int i=0; i<(int)vocabulary.size(); i++)
{
double prob = V.at<double>((int)obs.size()-1,i);
if ( prob > max_prob)
{
max_prob = prob;
best_idx = i;
}
}
//cout << path[best_idx] << endl;
out_sequence = out_sequence+" "+path[best_idx];
if (component_rects != NULL)
component_rects->push_back(words_rect[w]);
if (component_texts != NULL)
component_texts->push_back(path[best_idx]);
if (component_confidences != NULL)
component_confidences->push_back((float)max_prob);
}
return;
}
void run( Mat& image,
Mat& mask,
string& out_sequence,
vector<Rect>* component_rects,
vector<string>* component_texts,
vector<float>* component_confidences,
int component_level)
{
CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
CV_Assert( mask.type() == CV_8UC1 );
CV_Assert( (image.cols > 0) && (image.rows > 0) );
CV_Assert( (image.cols == mask.cols) && (image.rows == mask.rows) );
CV_Assert( component_level == OCR_LEVEL_WORD );
out_sequence.clear();
if (component_rects != NULL)
component_rects->clear();
if (component_texts != NULL)
component_texts->clear();
if (component_confidences != NULL)
component_confidences->clear();
// First we split a line into words
vector<Mat> words_mask;
vector<Rect> words_rect;
/// Find contours
vector<vector<Point> > contours;
vector<Vec4i> hierarchy;
Mat tmp;
mask.copyTo(tmp);
findContours( tmp, contours, hierarchy, RETR_EXTERNAL, CHAIN_APPROX_SIMPLE, Point(0, 0) );
if (contours.size() < 6)
{
//do not split lines with less than 6 characters
words_mask.push_back(mask);
words_rect.push_back(Rect(0,0,mask.cols,mask.rows));
}
else
{
Mat_<float> vector_w((int)mask.cols,1);
reduce(mask, vector_w, 0, REDUCE_SUM, -1);
vector<int> spaces;
vector<int> spaces_start;
vector<int> spaces_end;
int space_count=0;
int last_one_idx;
int s_init = 0, s_end=vector_w.cols;
for (int s=0; s<vector_w.cols; s++)
{
if (vector_w.at<float>(0,s) == 0)
s_init = s+1;
else
break;
}
for (int s=vector_w.cols-1; s>=0; s--)
{
if (vector_w.at<float>(0,s) == 0)
s_end = s;
else
break;
}
for (int s=s_init; s<s_end; s++)
{
if (vector_w.at<float>(0,s) == 0)
{
space_count++;
} else {
if (space_count!=0)
{
spaces.push_back(space_count);
spaces_start.push_back(last_one_idx);
spaces_end.push_back(s-1);
}
space_count = 0;
last_one_idx = s;
}
}
Scalar mean_space,std_space;
meanStdDev(Mat(spaces),mean_space,std_space);
int num_word_spaces = 0;
int last_word_space_end = 0;
for (int s=0; s<(int)spaces.size(); s++)
{
if (spaces_end.at(s)-spaces_start.at(s) > mean_space[0]+(mean_space[0]*1.1)) //this 1.1 is a param?
{
if (num_word_spaces == 0)
{
//cout << " we have a word from 0 to " << spaces_start.at(s) << endl;
Mat word_mask;
Rect word_rect = Rect(0,0,spaces_start.at(s),mask.rows);
mask(word_rect).copyTo(word_mask);
words_mask.push_back(word_mask);
words_rect.push_back(word_rect);
}
else
{
//cout << " we have a word from " << last_word_space_end << " to " << spaces_start.at(s) << endl;
Mat word_mask;
Rect word_rect = Rect(last_word_space_end,0,spaces_start.at(s)-last_word_space_end,mask.rows);
mask(word_rect).copyTo(word_mask);
words_mask.push_back(word_mask);
words_rect.push_back(word_rect);
}
num_word_spaces++;
last_word_space_end = spaces_end.at(s);
}
}
//cout << " we have a word from " << last_word_space_end << " to " << vector_w.cols << endl << endl << endl;
Mat word_mask;
Rect word_rect = Rect(last_word_space_end,0,vector_w.cols-last_word_space_end,mask.rows);
mask(word_rect).copyTo(word_mask);
words_mask.push_back(word_mask);
words_rect.push_back(word_rect);
}
for (int w=0; w<(int)words_mask.size(); w++)
{
vector< vector<int> > observations;
vector< vector<double> > confidences;
vector<int> obs;
// First find contours and sort by x coordinate of bbox
words_mask[w].copyTo(tmp);
if (tmp.empty())
continue;
contours.clear();
hierarchy.clear();
/// Find contours
findContours( tmp, contours, hierarchy, RETR_EXTERNAL, CHAIN_APPROX_SIMPLE, Point(0, 0) );
vector<Rect> contours_rect;
for (int i=0; i<(int)contours.size(); i++)
{
contours_rect.push_back(boundingRect(contours[i]));
}
sort(contours_rect.begin(), contours_rect.end(), sort_rect_horiz);
// Do character recognition foreach contour
for (int i=0; i<(int)contours.size(); i++)
{
vector<int> out_class;
vector<double> out_conf;
//take the center of the char rect and translate it to the real origin
Point char_center = Point(contours_rect.at(i).x+contours_rect.at(i).width/2,
contours_rect.at(i).y+contours_rect.at(i).height/2);
char_center.x += words_rect[w].x;
char_center.y += words_rect[w].y;
int win_size = max(contours_rect.at(i).width,contours_rect.at(i).height);
win_size += win_size*0.6; // add some pixels at the border. TODO: should this be a user parameter?
Rect char_rect = Rect(char_center.x-win_size/2,char_center.y-win_size/2,win_size,win_size);
char_rect &= Rect(0,0,image.cols,image.rows);
Mat tmp_image;
image(char_rect).copyTo(tmp_image);
classifier->eval(tmp_image,out_class,out_conf);
if (!out_class.empty())
obs.push_back(out_class[0]);
//cout << " out class = " << vocabulary[out_class[0]] << "(" << out_conf[0] << ")" << endl;
observations.push_back(out_class);
confidences.push_back(out_conf);
}
@@ -598,6 +871,278 @@ Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierNM(const std::string&
return makePtr<OCRHMMClassifierKNN>(filename);
}
class CV_EXPORTS OCRHMMClassifierCNN : public OCRHMMDecoder::ClassifierCallback
{
public:
//constructor
OCRHMMClassifierCNN(const std::string& filename);
// Destructor
~OCRHMMClassifierCNN() {}
void eval( InputArray image, vector<int>& out_class, vector<double>& out_confidence );
protected:
void normalizeAndZCA(Mat& patches);
double eval_feature(Mat& feature, double* prob_estimates);
private:
int nr_class; // number of classes
int nr_feature; // number of features
Mat feature_min; // scale range
Mat feature_max;
Mat weights; // Logistic Regression weights
Mat kernels; // CNN kernels
Mat M, P; // ZCA Whitening parameters
int window_size; // window size
int quad_size;
int patch_size;
int num_quads; // extract 25 quads (12x12) from each image
int num_tiles; // extract 25 patches (8x8) from each quad
double alpha; // used in non-linear activation function z = max(0, |D*a| - alpha)
};
OCRHMMClassifierCNN::OCRHMMClassifierCNN (const string& filename)
{
if (ifstream(filename.c_str()))
{
FileStorage fs(filename, FileStorage::READ);
// Load kernels bank and whitening params
fs["kernels"] >> kernels;
fs["M"] >> M;
fs["P"] >> P;
// Load Logistic Regression weights
fs["weights"] >> weights;
// Load feature scaling ranges
fs["feature_min"] >> feature_min;
fs["feature_max"] >> feature_max;
fs.release();
}
else
CV_Error(Error::StsBadArg, "Default classifier data file not found!");
// check all matrix dimensions match correctly and no one is empty
CV_Assert( (M.cols > 0) && (M.rows > 0) );
CV_Assert( (P.cols > 0) && (P.rows > 0) );
CV_Assert( (kernels.cols > 0) && (kernels.rows > 0) );
CV_Assert( (weights.cols > 0) && (weights.rows > 0) );
CV_Assert( (feature_min.cols > 0) && (feature_min.rows > 0) );
CV_Assert( (feature_max.cols > 0) && (feature_max.rows > 0) );
nr_feature = weights.rows;
nr_class = weights.cols;
patch_size = sqrt(kernels.cols);
// algorithm internal parameters
window_size = 32;
num_quads = 25;
num_tiles = 25;
quad_size = 12;
alpha = 0.5;
}
void OCRHMMClassifierCNN::eval( InputArray _src, vector<int>& out_class, vector<double>& out_confidence )
{
CV_Assert(( _src.getMat().type() == CV_8UC3 ) || ( _src.getMat().type() == CV_8UC1 ));
out_class.clear();
out_confidence.clear();
Mat img = _src.getMat();
if(img.type() == CV_8UC3)
{
cvtColor(img,img,COLOR_RGB2GRAY);
}
// shall we resize the input image or make a copy ?
resize(img,img,Size(window_size,window_size));
Mat quad;
Mat tmp;
int patch_count = 0;
vector< vector<double> > data_pool(9);
int quad_id = 1;
for (int q_x=0; q_x<=window_size-quad_size; q_x=q_x+(quad_size/2-1))
{
for (int q_y=0; q_y<=window_size-quad_size; q_y=q_y+(quad_size/2-1))
{
Rect quad_rect = Rect(q_x,q_y,quad_size,quad_size);
quad = img(quad_rect);
//start sliding window (8x8) in each tile and store the patch as row in data_pool
for (int w_x=0; w_x<=quad_size-patch_size; w_x++)
{
for (int w_y=0; w_y<=quad_size-patch_size; w_y++)
{
quad(Rect(w_x,w_y,patch_size,patch_size)).copyTo(tmp);
tmp = tmp.reshape(0,1);
tmp.convertTo(tmp, CV_64F);
normalizeAndZCA(tmp);
vector<double> patch;
tmp.copyTo(patch);
if ((quad_id == 1)||(quad_id == 2)||(quad_id == 6)||(quad_id == 7))
data_pool[0].insert(data_pool[0].end(),patch.begin(),patch.end());
if ((quad_id == 2)||(quad_id == 7)||(quad_id == 3)||(quad_id == 8)||(quad_id == 4)||(quad_id == 9))
data_pool[1].insert(data_pool[1].end(),patch.begin(),patch.end());
if ((quad_id == 4)||(quad_id == 9)||(quad_id == 5)||(quad_id == 10))
data_pool[2].insert(data_pool[2].end(),patch.begin(),patch.end());
if ((quad_id == 6)||(quad_id == 11)||(quad_id == 16)||(quad_id == 7)||(quad_id == 12)||(quad_id == 17))
data_pool[3].insert(data_pool[3].end(),patch.begin(),patch.end());
if ((quad_id == 7)||(quad_id == 12)||(quad_id == 17)||(quad_id == 8)||(quad_id == 13)||(quad_id == 18)||(quad_id == 9)||(quad_id == 14)||(quad_id == 19))
data_pool[4].insert(data_pool[4].end(),patch.begin(),patch.end());
if ((quad_id == 9)||(quad_id == 14)||(quad_id == 19)||(quad_id == 10)||(quad_id == 15)||(quad_id == 20))
data_pool[5].insert(data_pool[5].end(),patch.begin(),patch.end());
if ((quad_id == 16)||(quad_id == 21)||(quad_id == 17)||(quad_id == 22))
data_pool[6].insert(data_pool[6].end(),patch.begin(),patch.end());
if ((quad_id == 17)||(quad_id == 22)||(quad_id == 18)||(quad_id == 23)||(quad_id == 19)||(quad_id == 24))
data_pool[7].insert(data_pool[7].end(),patch.begin(),patch.end());
if ((quad_id == 19)||(quad_id == 24)||(quad_id == 20)||(quad_id == 25))
data_pool[8].insert(data_pool[8].end(),patch.begin(),patch.end());
patch_count++;
}
}
quad_id++;
}
}
//do dot product of each normalized and whitened patch
//each pool is averaged and this yields a representation of 9xD
Mat feature = Mat::zeros(9,kernels.rows,CV_64FC1);
for (int i=0; i<9; i++)
{
Mat pool = Mat(data_pool[i]);
pool = pool.reshape(0,(int)data_pool[i].size()/kernels.cols);
for (int p=0; p<pool.rows; p++)
{
for (int f=0; f<kernels.rows; f++)
{
feature.row(i).at<double>(0,f) = feature.row(i).at<double>(0,f) + max(0.0,std::abs(pool.row(p).dot(kernels.row(f)))-alpha);
}
}
}
feature = feature.reshape(0,1);
// data must be normalized within the range obtained during training
double lower = -1.0;
double upper = 1.0;
for (int k=0; k<feature.cols; k++)
{
feature.at<double>(0,k) = lower + (upper-lower) *
(feature.at<double>(0,k)-feature_min.at<double>(0,k))/
(feature_max.at<double>(0,k)-feature_min.at<double>(0,k));
}
double *p = new double[nr_class];
double predict_label = eval_feature(feature,p);
//cout << " Prediction: " << vocabulary[predict_label] << " with probability " << p[0] << endl;
if (predict_label < 0)
CV_Error(Error::StsInternal, "OCRHMMClassifierCNN::eval Error: unexpected prediction in eval_feature()");
out_class.push_back((int)predict_label);
out_confidence.push_back(p[(int)predict_label]);
for (int i = 0; i<nr_class; i++)
{
if ( (i != (int)predict_label) && (p[i] != 0.) )
{
out_class.push_back(i);
out_confidence.push_back(p[i]);
}
}
}
// normalize for contrast and apply ZCA whitening to a set of image patches
void OCRHMMClassifierCNN::normalizeAndZCA(Mat& patches)
{
//Normalize for contrast
for (int i=0; i<patches.rows; i++)
{
Scalar row_mean, row_std;
meanStdDev(patches.row(i),row_mean,row_std);
row_std[0] = sqrt(pow(row_std[0],2)*patches.cols/(patches.cols-1)+10);
patches.row(i) = (patches.row(i) - row_mean[0]) / row_std[0];
}
//ZCA whitening
if ((M.dims == 0) || (P.dims == 0))
{
Mat CC;
calcCovarMatrix(patches,CC,M,COVAR_NORMAL|COVAR_ROWS|COVAR_SCALE);
CC = CC * patches.rows / (patches.rows-1);
Mat e_val,e_vec;
eigen(CC.t(),e_val,e_vec);
e_vec = e_vec.t();
sqrt(1./(e_val + 0.1), e_val);
Mat V = Mat::zeros(e_vec.rows, e_vec.cols, CV_64FC1);
Mat D = Mat::eye(e_vec.rows, e_vec.cols, CV_64FC1);
for (int i=0; i<e_vec.cols; i++)
{
e_vec.col(e_vec.cols-i-1).copyTo(V.col(i));
D.col(i) = D.col(i) * e_val.at<double>(0,e_val.rows-i-1);
}
P = V * D * V.t();
}
for (int i=0; i<patches.rows; i++)
patches.row(i) = patches.row(i) - M;
patches = patches * P;
}
double OCRHMMClassifierCNN::eval_feature(Mat& feature, double* prob_estimates)
{
for(int i=0;i<nr_class;i++)
prob_estimates[i] = 0;
for(int idx=0; idx<nr_feature; idx++)
for(int i=0;i<nr_class;i++)
prob_estimates[i] += weights.at<float>(idx,i)*feature.at<double>(0,idx); //TODO use vectorized dot product
int dec_max_idx = 0;
for(int i=1;i<nr_class;i++)
{
if(prob_estimates[i] > prob_estimates[dec_max_idx])
dec_max_idx = i;
}
for(int i=0;i<nr_class;i++)
prob_estimates[i]=1/(1+exp(-prob_estimates[i]));
double sum=0;
for(int i=0; i<nr_class; i++)
sum+=prob_estimates[i];
for(int i=0; i<nr_class; i++)
prob_estimates[i]=prob_estimates[i]/sum;
return dec_max_idx;
}
Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierCNN(const std::string& filename)
{
return makePtr<OCRHMMClassifierCNN>(filename);
}
/** @brief Utility function to create a tailored language model transitions table from a given list of words (lexicon).
@param vocabulary The language vocabulary (chars when ascii english text).

@@ -70,6 +70,22 @@ void OCRTesseract::run(Mat& image, string& output_text, vector<Rect>* component_
component_confidences->clear();
}
void OCRTesseract::run(Mat& image, Mat& mask, string& output_text, vector<Rect>* component_rects,
vector<string>* component_texts, vector<float>* component_confidences,
int component_level)
{
CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
CV_Assert( mask.type() == CV_8UC1 );
CV_Assert( (component_level == OCR_LEVEL_TEXTLINE) || (component_level == OCR_LEVEL_WORD) );
output_text.clear();
if (component_rects != NULL)
component_rects->clear();
if (component_texts != NULL)
component_texts->clear();
if (component_confidences != NULL)
component_confidences->clear();
}
class OCRTesseractImpl : public OCRTesseract
{
private:
@@ -189,6 +205,16 @@ public:
#endif
}
void run(Mat& image, Mat& mask, string& output, vector<Rect>* component_rects=NULL,
vector<string>* component_texts=NULL, vector<float>* component_confidences=NULL,
int component_level=0)
{
CV_Assert( mask.type() == CV_8UC1 );
CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
run( mask, output, component_rects, component_texts, component_confidences, component_level);
}
};
