Adds a first implementation of the OCRBeamSearchDecoder class using the Single Layer CNN character classifier described in Coates, Adam, et al. paper: Text detection and character recognition in scene images with unsupervised feature learning, ICDAR 2011

10 years ago · 52cca0ddc6
parent c05a7e0182
commit 52cca0ddc6
2 changed files with 768 additions and 0 deletions
--- a/modules/text/include/opencv2/text/ocr.hpp
+++ b/modules/text/include/opencv2/text/ocr.hpp
@ -240,6 +240,117 @@ types.
 */
 CV_EXPORTS Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierNM(const std::string& filename);

+
+/* OCR BeamSearch Decoder */
+
+/** @brief OCRBeamSearchDecoder class provides an interface for OCR using Beam Search algorithm.
+
+@note
+   -   (C++) An example on using OCRBeamSearchDecoder recognition combined with scene text detection can
+        be found at the demo sample:
+        <https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/word_recognition.cpp>
+ */
+class CV_EXPORTS OCRBeamSearchDecoder : public BaseOCR
+{
+public:
+
+    /** @brief Callback with the character classifier is made a class.
+
+    This way it hides the feature extractor and the classifier itself, so developers can write
+    their own OCR code.
+
+    The default character classifier and feature extractor can be loaded using the utility function
+    loadOCRBeamSearchClassifierCNN with all its parameters provided in
+    <https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/OCRBeamSearch_CNN_model_data.xml.gz>.
+     */
+    class CV_EXPORTS ClassifierCallback
+    {
+    public:
+        virtual ~ClassifierCallback() { }
+        /** @brief The character classifier must return the class probabilities of every evaluated
+        window of the input image, together with the list of candidate segmentation points.
+
+        The default implementation only checks the image type and leaves both outputs empty.
+
+        @param image Input image CV_8UC1 or CV_8UC3.
+        @param recognition_probabilities Output. One vector of class probabilities per evaluated
+        window. The decoder indexes each vector by position in the vocabulary.
+        @param oversegmentation Output. List of candidate segmentation points. The decoder scores
+        a character ending at segmentation point k with recognition_probabilities[k-1].
+         */
+        virtual void eval( InputArray image, std::vector< std::vector<double> >& recognition_probabilities, std::vector<int>& oversegmentation );
+    };
+
+public:
+    /** @brief Recognize text using Beam Search.
+
+    Takes image on input and returns recognized text in the output_text parameter. Optionally
+    provides also the Rects for individual text elements found (e.g. words), and the list of those
+    text elements with their confidence values.
+
+    @param image Input image CV_8UC1 or CV_8UC3 with a single text line (or word).
+
+    @param output_text Output text. Most likely character sequence found by the beam search decoder.
+
+    @param component_rects If provided the method will output a list of Rects for the individual
+    text elements found (e.g. words). The list is cleared on entry.
+
+    @param component_texts If provided the method will output a list of text strings for the
+    recognition of individual text elements found (e.g. words). The list is cleared on entry.
+
+    @param component_confidences If provided the method will output a list of confidence values
+    for the recognition of individual text elements found (e.g. words). The list is cleared on entry.
+
+    @param component_level Only OCR_LEVEL_WORD is supported.
+     */
+    virtual void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL,
+                     std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
+                     int component_level=0);
+
+    /** @brief Creates an instance of the OCRBeamSearchDecoder class.
+
+    @param classifier The character classifier with built in feature extractor.
+
+    @param vocabulary The language vocabulary (chars when ascii english text). vocabulary.size()
+    must be equal to the number of classes of the classifier.
+
+    @param transition_probabilities_table Table with transition probabilities between character
+    pairs. cols == rows == vocabulary.size(). The decoder reads it as doubles (CV_64F).
+
+    @param emission_probabilities_table Table with observation emission probabilities. cols ==
+    rows == vocabulary.size().
+
+    @param mode HMM Decoding algorithm. Only OCR_DECODER_VITERBI is available for the moment
+    (<http://en.wikipedia.org/wiki/Viterbi_algorithm>).
+
+    @param beam_size Size of the beam in Beam Search algorithm, i.e. the maximum number of
+    candidate segmentations kept at any time.
+     */
+    static Ptr<OCRBeamSearchDecoder> create(const Ptr<OCRBeamSearchDecoder::ClassifierCallback> classifier,// The character classifier with built in feature extractor
+                                     const std::string& vocabulary,                    // The language vocabulary (chars when ascii english text)
+                                                                                       //     size() must be equal to the number of classes
+                                     InputArray transition_probabilities_table,        // Table with transition probabilities between character pairs
+                                                                                       //     cols == rows == vocabulary.size()
+                                     InputArray emission_probabilities_table,          // Table with observation emission probabilities
+                                                                                       //     cols == rows == vocabulary.size()
+                                     decoder_mode mode = OCR_DECODER_VITERBI,          // HMM Decoding algorithm (only Viterbi for the moment)
+                                     int beam_size = 50);                              // Size of the beam in Beam Search algorithm
+
+protected:
+
+    Ptr<OCRBeamSearchDecoder::ClassifierCallback> classifier; // character classifier evaluated on the input image
+    std::string vocabulary; // one character per classifier class
+    Mat transition_p; // transition probabilities between character pairs
+    Mat emission_p; // observation emission probabilities
+    decoder_mode mode; // decoding algorithm selector
+    int beam_size; // maximum number of candidates kept in the beam
+};
+
+/** @brief Allow to implicitly load the default character classifier when creating an OCRBeamSearchDecoder object.
+
+@param filename The XML or YAML file with the classifier model (e.g. OCRBeamSearch_CNN_model_data.xml.gz)
+
+The default classifier is based on the scene text recognition method proposed by Adam Coates &
+Andrew NG in [Coates11a]. The character classifier consists of a Single Layer Convolutional Neural Network and
+a linear classifier. It is applied to the input image in a sliding window fashion, providing a set of recognitions
+at each window location.
+ */
+CV_EXPORTS Ptr<OCRBeamSearchDecoder::ClassifierCallback> loadOCRBeamSearchClassifierCNN(const std::string& filename);
+
 //! @}

 }
--- a/modules/text/src/ocr_beamsearch_decoder.cpp
+++ b/modules/text/src/ocr_beamsearch_decoder.cpp
@ -0,0 +1,657 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include "opencv2/imgproc.hpp"
+#include "opencv2/ml.hpp"
+
+#include <iostream>
+#include <fstream>
+#include <set>
+
+namespace cv
+{
+namespace text
+{
+
+using namespace std;
+using namespace cv::ml;
+
+/* OCR BeamSearch Decoder */
+
+// Base implementation: validates the arguments and resets every output. No text is recognized
+// here; the actual decoding lives in the implementation class.
+void OCRBeamSearchDecoder::run(Mat& image, string& output_text, vector<Rect>* component_rects,
+                               vector<string>* component_texts, vector<float>* component_confidences,
+                               int component_level)
+{
+    CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
+    CV_Assert( (component_level == OCR_LEVEL_TEXTLINE) || (component_level == OCR_LEVEL_WORD) );
+
+    // The optional outputs are only touched when the caller supplied them.
+    if (component_confidences)
+        component_confidences->clear();
+    if (component_texts)
+        component_texts->clear();
+    if (component_rects)
+        component_rects->clear();
+
+    output_text.clear();
+}
+
+
+// Default callback: checks the image type and hands back empty outputs.
+void OCRBeamSearchDecoder::ClassifierCallback::eval( InputArray image, vector< vector<double> >& recognition_probabilities, vector<int>& oversegmentation)
+{
+    CV_Assert(( image.getMat().type() == CV_8UC3 ) || ( image.getMat().type() == CV_8UC1 ));
+
+    // Empty each per-window vector first, then drop the containers themselves.
+    for (vector< vector<double> >::iterator window = recognition_probabilities.begin();
+         window != recognition_probabilities.end(); ++window)
+    {
+        window->clear();
+    }
+
+    oversegmentation.clear();
+    recognition_probabilities.clear();
+}
+
+
+// Orders beam entries by decreasing score, so the best candidate is first after sorting.
+// The entries are taken by const reference: the sort calls this comparator many times and a
+// by-value parameter would copy the segmentation vector held in each pair on every call.
+bool beam_sort_function ( const pair< double,vector<int> >& i, const pair< double,vector<int> >& j );
+bool beam_sort_function ( const pair< double,vector<int> >& i, const pair< double,vector<int> >& j )
+{
+    return (i.first > j.first);
+}
+
+
+// Beam Search decoder. The classifier is evaluated once on the whole image; the decoder then
+// explores subsets of the candidate segmentation points, scoring each subset with a Viterbi pass
+// over the character transition model, and keeps the beam_size best subsets.
+class OCRBeamSearchDecoderImpl : public OCRBeamSearchDecoder
+{
+public:
+    // Stores the model. The transition table is copied as CV_64F and converted to
+    // log-probabilities once, here: run() may be called many times on the same decoder and must
+    // neither re-apply the logarithm nor write into the matrix owned by the caller.
+    OCRBeamSearchDecoderImpl( Ptr<OCRBeamSearchDecoder::ClassifierCallback> _classifier,
+                              const string& _vocabulary,
+                              InputArray transition_probabilities_table,
+                              InputArray emission_probabilities_table,
+                              decoder_mode _mode,
+                              int _beam_size)
+    {
+        classifier = _classifier;
+        emission_p = emission_probabilities_table.getMat();
+        vocabulary = _vocabulary;
+        mode = _mode;
+        beam_size = _beam_size;
+
+        Mat table = transition_probabilities_table.getMat();
+        if (!table.empty())
+            table.convertTo(transition_p, CV_64F);
+
+        // Zero probabilities map to the most negative finite value instead of -infinity.
+        for (int i=0; i<transition_p.rows; i++)
+        {
+            for (int j=0; j<transition_p.cols; j++)
+            {
+                double &p = transition_p.at<double>(i,j);
+                p = (p == 0) ? -DBL_MAX : log(p);
+            }
+        }
+    }
+
+    ~OCRBeamSearchDecoderImpl()
+    {
+    }
+
+    // Recognizes the word in src and writes it to out_sequence.
+    //
+    // src                    CV_8UC1 or CV_8UC3 image, not empty. It is not modified: a 3-channel
+    //                        input is converted to grayscale into a local copy.
+    // out_sequence           receives the best character sequence; left empty when the
+    //                        classifier produces no window or the vocabulary is empty.
+    // component_rects, component_texts, component_confidences
+    //                        cleared when provided; not filled yet (see TODO below).
+    // component_level        must be OCR_LEVEL_WORD.
+    void run( Mat& src,
+              string& out_sequence,
+              vector<Rect>* component_rects,
+              vector<string>* component_texts,
+              vector<float>* component_confidences,
+              int component_level)
+    {
+
+        CV_Assert( (src.type() == CV_8UC1) || (src.type() == CV_8UC3) );
+        CV_Assert( (src.cols > 0) && (src.rows > 0) );
+        CV_Assert( component_level == OCR_LEVEL_WORD );
+        CV_Assert( !classifier.empty() );
+        out_sequence.clear();
+        if (component_rects != NULL)
+            component_rects->clear();
+        if (component_texts != NULL)
+            component_texts->clear();
+        if (component_confidences != NULL)
+            component_confidences->clear();
+
+        // TODO split a line into words
+
+        Mat gray;
+        if(src.type() == CV_8UC3)
+            cvtColor(src,gray,COLOR_RGB2GRAY);
+        else
+            gray = src;
+
+        vector< vector<double> > recognition_probabilities;
+        vector<int> oversegmentation;
+
+        classifier->eval(gray, recognition_probabilities, oversegmentation);
+
+        // Nothing to decode: without at least one window and two segmentation points the
+        // scoring code below would index outside the classifier output.
+        if (recognition_probabilities.empty() || (oversegmentation.size() < 2) || vocabulary.empty())
+            return;
+
+        // The scoring code reads one entry per vocabulary character from every table.
+        const size_t num_classes = vocabulary.size();
+        CV_Assert( ((size_t)transition_p.rows >= num_classes) && ((size_t)transition_p.cols >= num_classes) );
+        for (size_t i=0; i<recognition_probabilities.size(); i++)
+            CV_Assert( recognition_probabilities[i].size() >= num_classes );
+
+        /*Now we go here with the beam search algorithm to optimize the recognition score*/
+
+        //convert probabilities to log probabilities
+        for (size_t i=0; i<recognition_probabilities.size(); i++)
+        {
+            for (size_t j=0; j<recognition_probabilities[i].size(); j++)
+            {
+                double &p = recognition_probabilities[i][j];
+                p = (p == 0) ? -DBL_MAX : log(p);
+            }
+        }
+
+        // Segmentations already expanded. Each node is identified by its sorted list of
+        // segmentation points, which stays unambiguous for any number of points.
+        set< vector<int> > visited_nodes; //TODO make it member of class
+        // it is also possible to reduce the number of seg. points in some way (e.g. use only seg.points
+        // for which there is a change on the class prediction)
+
+        // Start from the coarsest segmentation: the whole image as a single character.
+        vector<int> start_segmentation;
+        start_segmentation.push_back(oversegmentation.front());
+        start_segmentation.push_back(oversegmentation.back());
+
+        vector< pair< double,vector<int> > > beam;
+        beam.push_back( pair< double,vector<int> > (score_segmentation(start_segmentation, recognition_probabilities, out_sequence), start_segmentation) );
+
+        vector< vector<int> > childs = generate_childs(start_segmentation,oversegmentation, visited_nodes);
+        if (!childs.empty())
+            update_beam( beam, childs, recognition_probabilities);
+
+        // Keep expanding the beam until no unvisited segmentation can be generated.
+        size_t generated_childs = childs.size();
+        while (generated_childs != 0)
+        {
+            generated_childs = 0;
+            // update_beam() modifies the beam, so iterate over a snapshot of it
+            vector< pair< double,vector<int> > > old_beam = beam;
+
+            for (size_t i=0; i<old_beam.size(); i++)
+            {
+                childs = generate_childs(old_beam[i].second,oversegmentation, visited_nodes);
+                if (!childs.empty())
+                    update_beam( beam, childs, recognition_probabilities);
+                generated_childs += childs.size();
+            }
+        }
+
+
+        // FINISHED ! Get the best prediction found into out_sequence
+        score_segmentation(beam[0].second, recognition_probabilities, out_sequence);
+
+
+        // TODO fill other output parameters
+
+        return;
+    }
+
+private:
+
+    ////////////////////////////////////////////////////////////
+
+    // TODO the way we expand nodes makes the recognition score heuristic not monotonic
+    // it should start from left node 0 and grow always to the right.
+
+    // Returns every segmentation obtained by adding one unused point of oversegmentation to
+    // segmentation, skipping those already recorded in visited_nodes. Each returned child is
+    // sorted and is recorded as visited.
+    vector< vector<int> > generate_childs(vector<int> &segmentation, vector<int> &oversegmentation, set< vector<int> > &visited_nodes)
+    {
+        vector< vector<int> > childs;
+        for (size_t i=0; i<oversegmentation.size(); i++)
+        {
+            int seg_point = oversegmentation[i];
+            if (find(segmentation.begin(), segmentation.end(), seg_point) != segmentation.end())
+                continue; // point already part of this segmentation
+
+            vector<int> child = segmentation;
+            child.push_back(seg_point);
+            sort(child.begin(), child.end());
+
+            // insert() reports whether the node is new
+            if (visited_nodes.insert(child).second)
+                childs.push_back(child);
+        }
+        return childs;
+    }
+
+
+    ////////////////////////////////////////////////////////////
+
+    // Scores every child and inserts into the beam those that beat its current worst entry,
+    // keeping the beam sorted by decreasing score and at most beam_size entries long.
+    //TODO shall the beam itself be a member of the class?
+    void update_beam (vector< pair< double,vector<int> > > &beam, vector< vector<int> > &childs, vector< vector<double> > &recognition_probabilities)
+    {
+        string out_sequence;
+        double min_score = -DBL_MAX; //min score value to be part of the beam
+        if ((int)beam.size() == beam_size)
+            min_score = beam[beam.size()-1].first; //last element has the lowest score
+        for (size_t i=0; i<childs.size(); i++)
+        {
+            double score = score_segmentation(childs[i], recognition_probabilities, out_sequence);
+            if (score > min_score)
+            {
+                beam.push_back(pair< double,vector<int> >(score,childs[i]));
+                sort(beam.begin(),beam.end(),beam_sort_function);
+                if ((int)beam.size() > beam_size)
+                {
+                    beam.pop_back();
+                    min_score = beam[beam.size()-1].first;
+                }
+            }
+        }
+    }
+
+
+    ////////////////////////////////////////////////////////////
+    // TODO Add heuristics to the score function (see PhotoOCR paper)
+    // e.g.: in some cases we discard a segmentation because it includes a very large character
+    //       in other cases we do it because the overlapping between two chars is too large
+    //       etc.
+
+    // Runs Viterbi over the characters delimited by segmentation (sorted, at least two points).
+    // The character ending at point segmentation[t+1] is observed through
+    // observations[segmentation[t+1]-1], which holds log-probabilities per vocabulary entry.
+    // Writes the best character sequence to outstring and returns its log-score divided by the
+    // number of characters.
+    double score_segmentation(vector<int> &segmentation, vector< vector<double> > &observations, string& outstring)
+    {
+        const int num_states = (int)vocabulary.size();
+        const int num_steps  = (int)segmentation.size()-1;
+
+        //TODO This must be extracted from dictionary
+        // uniform start probability for every character
+        const double start_p = log(1.0/vocabulary.size());
+
+        Mat V(num_steps, num_states, CV_64FC1, Scalar(-DBL_MAX));
+        vector<string> path(vocabulary.size());
+
+        // Initialize base cases (t == 0)
+        for (int i=0; i<num_states; i++)
+        {
+            V.at<double>(0,i) = start_p + observations[segmentation[1]-1][i];
+            path[i] = vocabulary.at(i);
+        }
+
+
+        // Run Viterbi for t > 0
+        for (int t=1; t<num_steps; t++)
+        {
+
+            vector<string> newpath(vocabulary.size());
+
+            for (int i=0; i<num_states; i++)
+            {
+                double max_prob = -DBL_MAX;
+                int best_idx = 0;
+                for (int j=0; j<num_states; j++)
+                {
+                    double prob = V.at<double>(t-1,j) + transition_p.at<double>(j,i) + observations[segmentation[t+1]-1][i];
+                    if ( prob > max_prob)
+                    {
+                        max_prob = prob;
+                        best_idx = j;
+                    }
+                }
+
+                V.at<double>(t,i) = max_prob;
+                newpath[i] = path[best_idx] + vocabulary.at(i);
+            }
+
+            // Don't need to remember the old paths
+            path.swap(newpath);
+        }
+
+        // Pick the best final state
+        double max_prob = -DBL_MAX;
+        int best_idx = 0;
+        for (int i=0; i<num_states; i++)
+        {
+            double prob = V.at<double>(num_steps-1,i);
+            if ( prob > max_prob)
+            {
+                max_prob = prob;
+                best_idx = i;
+            }
+        }
+
+        outstring = path[best_idx];
+        return max_prob / (segmentation.size()-1);
+    }
+
+};
+
+// Factory: builds the concrete beam search decoder and returns it through the public interface.
+Ptr<OCRBeamSearchDecoder> OCRBeamSearchDecoder::create( Ptr<OCRBeamSearchDecoder::ClassifierCallback> _classifier,
+                                                        const string& _vocabulary,
+                                                        InputArray transition_p,
+                                                        InputArray emission_p,
+                                                        decoder_mode _mode,
+                                                        int _beam_size)
+{
+    Ptr<OCRBeamSearchDecoderImpl> decoder =
+        makePtr<OCRBeamSearchDecoderImpl>(_classifier, _vocabulary, transition_p, emission_p, _mode, _beam_size);
+    return decoder;
+}
+
+
+class CV_EXPORTS OCRBeamSearchClassifierCNN : public OCRBeamSearchDecoder::ClassifierCallback
+{
+public:
+    //constructor: loads the model matrices from the given file and sets the window geometry
+    OCRBeamSearchClassifierCNN(const std::string& filename);
+    // Destructor
+    ~OCRBeamSearchClassifierCNN() {}
+
+    void eval( InputArray src, vector< vector<double> >& recognition_probabilities, vector<int>& oversegmentation ); // slides a window over src; outputs one probability vector per window
+
+protected:
+    void normalizeAndZCA(Mat& patches); // in place: contrast normalization and ZCA whitening of each row
+    double eval_feature(Mat& feature, double* prob_estimates); // fills prob_estimates (nr_class values), returns the index of the best class
+
+private:
+    //TODO implement getters/setters for some of these members (if apply)
+    int nr_class;		 // number of classes (columns of weights)
+    int nr_feature;  // number of features (rows of weights)
+    Mat feature_min; // scale range: per-feature minimum, read as double
+    Mat feature_max; // scale range: per-feature maximum, read as double
+    Mat weights;     // Logistic Regression weights, read as float
+    Mat kernels;     // CNN kernels, one per row
+    Mat M, P;        // ZCA Whitening parameters (M is subtracted from a patch, then it is multiplied by P)
+    int step_size;   // sliding window step
+    int window_size; // window size
+    int quad_size;   // side of the square quads taken from each window
+    int patch_size;  // side of the square patches taken from each quad
+    int num_quads;   // extract 25 quads (12x12) from each image
+    int num_tiles;   // extract 25 patches (8x8) from each quad
+    double alpha;    // used in non-linear activation function z = max(0, |D*a| - alpha)
+};
+
+// Loads the classifier model from a FileStorage file and sets the fixed window geometry.
+// Raises Error::StsBadArg when the file does not exist, cannot be opened as a FileStorage, or
+// lacks one of the matrices that eval() reads unconditionally.
+OCRBeamSearchClassifierCNN::OCRBeamSearchClassifierCNN (const string& filename)
+{
+    if (!ifstream(filename.c_str()))
+        CV_Error(Error::StsBadArg, "Default classifier data file not found!");
+
+    FileStorage fs(filename, FileStorage::READ);
+    if (!fs.isOpened())
+        CV_Error(Error::StsBadArg, "Default classifier data file could not be opened!");
+
+    // Load kernels bank and whitening params
+    fs["kernels"] >> kernels;
+    fs["M"] >> M;
+    fs["P"] >> P;
+    // Load Logistic Regression weights
+    fs["weights"] >> weights;
+    // Load feature scaling ranges
+    fs["feature_min"] >> feature_min;
+    fs["feature_max"] >> feature_max;
+    fs.release();
+
+    // M and P are not checked here: normalizeAndZCA() has its own path for when they are absent.
+    // TODO check all matrix dimensions match correctly
+    if (kernels.empty() || weights.empty() || feature_min.empty() || feature_max.empty())
+        CV_Error(Error::StsBadArg, "Default classifier data file is missing required model matrices!");
+
+    nr_feature = weights.rows;
+    nr_class   = weights.cols;
+    // TODO some of this can be inferred from the input file (e.g. patch size must be sqrt(filters.cols))
+    step_size   = 4;
+    window_size = 32;
+    quad_size   = 12;
+    patch_size  = 8;
+    num_quads   = 25;
+    num_tiles   = 25;
+    alpha       = 0.5;
+}
+
+// Evaluates the character classifier on every sliding-window position of the input image.
+//
+// _src                       CV_8UC1 or CV_8UC3 image, not empty. It is converted to grayscale
+//                            if needed and resized to window_size rows, keeping the aspect ratio.
+// recognition_probabilities  output, one vector of nr_class probabilities per window position.
+// oversegmentation           output, the points 0, 1, ..., N where N is the number of windows.
+void OCRBeamSearchClassifierCNN::eval( InputArray _src, vector< vector<double> >& recognition_probabilities, vector<int>& oversegmentation)
+{
+
+    CV_Assert(( _src.getMat().type() == CV_8UC3 ) || ( _src.getMat().type() == CV_8UC1 ));
+    // the resize below divides by the number of rows
+    CV_Assert( !_src.getMat().empty() );
+    recognition_probabilities.clear();
+    oversegmentation.clear();
+
+
+    Mat input = _src.getMat();
+    Mat gray;
+    if(input.type() == CV_8UC3)
+        cvtColor(input,gray,COLOR_RGB2GRAY);
+    else
+        gray = input;
+
+    Mat src;
+    resize(gray,src,Size(window_size*gray.cols/gray.rows,window_size));
+
+    int seg_points = 0;
+    oversegmentation.push_back(seg_points);
+
+    // Quads are visited column by column on a 5x5 grid. The 9 pools form a 3x3 grid; along each
+    // axis pool band 0 takes quad indices 0-1, band 1 takes 1-3 and band 2 takes 3-4, so quads on
+    // a band border contribute to both neighbouring pools.
+    static const int band_first[3] = { 0, 1, 3 };
+    static const int band_last[3]  = { 1, 3, 4 };
+    const int quad_step = quad_size/2-1;
+
+    vector<double> probabilities(nr_class);
+    Mat tmp;
+
+    // begin sliding window loop foreach detection window
+    for (int x_c=0; x_c<=src.cols-window_size; x_c=x_c+step_size)
+    {
+
+        Mat img = src(Rect(Point(x_c,0),Size(window_size,window_size)));
+
+        vector< vector<double> > data_pool(9);
+
+        int quad_col = 0;
+        for (int q_x=0; q_x<=window_size-quad_size; q_x=q_x+quad_step, quad_col++)
+        {
+            int quad_row = 0;
+            for (int q_y=0; q_y<=window_size-quad_size; q_y=q_y+quad_step, quad_row++)
+            {
+                Mat quad = img(Rect(q_x,q_y,quad_size,quad_size));
+
+                //start sliding window (8x8) in each tile and store the patch as row in data_pool
+                for (int w_x=0; w_x<=quad_size-patch_size; w_x++)
+                {
+                    for (int w_y=0; w_y<=quad_size-patch_size; w_y++)
+                    {
+                        quad(Rect(w_x,w_y,patch_size,patch_size)).copyTo(tmp);
+                        tmp = tmp.reshape(0,1);
+                        tmp.convertTo(tmp, CV_64F);
+                        normalizeAndZCA(tmp);
+                        vector<double> patch;
+                        tmp.copyTo(patch);
+
+                        // append the patch to every pool whose bands contain this quad
+                        for (int band_x=0; band_x<3; band_x++)
+                        {
+                            if ((quad_col < band_first[band_x]) || (quad_col > band_last[band_x]))
+                                continue;
+                            for (int band_y=0; band_y<3; band_y++)
+                            {
+                                if ((quad_row < band_first[band_y]) || (quad_row > band_last[band_y]))
+                                    continue;
+                                vector<double> &pool = data_pool[3*band_x+band_y];
+                                pool.insert(pool.end(),patch.begin(),patch.end());
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        //do dot product of each normalized and whitened patch
+        //the responses of each pool are summed and this yields a representation of 9xD
+        Mat feature = Mat::zeros(9,kernels.rows,CV_64FC1);
+        for (int i=0; i<9; i++)
+        {
+            Mat pool = Mat(data_pool[i]);
+            pool = pool.reshape(0,(int)(data_pool[i].size()/kernels.cols));
+            for (int p=0; p<pool.rows; p++)
+            {
+                const Mat pool_row = pool.row(p);
+                for (int f=0; f<kernels.rows; f++)
+                {
+                    feature.at<double>(i,f) = feature.at<double>(i,f) + max(0.0,std::abs(pool_row.dot(kernels.row(f)))-alpha);
+                }
+            }
+        }
+        feature = feature.reshape(0,1);
+
+
+        // data must be normalized within the range obtained during training
+        double lower = -1.0;
+        double upper =  1.0;
+        for (int k=0; k<feature.cols; k++)
+        {
+            feature.at<double>(0,k) = lower + (upper-lower) *
+                    (feature.at<double>(0,k)-feature_min.at<double>(0,k))/
+                    (feature_max.at<double>(0,k)-feature_min.at<double>(0,k));
+        }
+
+        // Only the probabilities are needed here; the returned index of the best class is not.
+        eval_feature(feature, probabilities.empty() ? NULL : &probabilities[0]);
+
+        seg_points++;
+        oversegmentation.push_back(seg_points);
+        recognition_probabilities.push_back(probabilities);
+
+    }
+
+
+}
+
+// Normalizes for contrast and applies ZCA whitening to a set of image patches, in place.
+// patches holds one patch per row. Each row is shifted by its mean and divided by its corrected
+// standard deviation; then the mean vector M is subtracted and the result is multiplied by the
+// whitening matrix P. When M or P was not loaded from the model, both are estimated from patches
+// and stored in the object, which needs at least two rows.
+void OCRBeamSearchClassifierCNN::normalizeAndZCA(Mat& patches)
+{
+
+    //Normalize for contrast
+    for (int i=0; i<patches.rows; i++)
+    {
+        Scalar row_mean, row_std;
+        meanStdDev(patches.row(i),row_mean,row_std);
+        // variance rescaled by cols/(cols-1), plus a constant 10 before taking the square root
+        row_std[0] = sqrt(pow(row_std[0],2)*patches.cols/(patches.cols-1)+10);
+        patches.row(i) = (patches.row(i) - row_mean[0]) / row_std[0];
+    }
+
+
+    //ZCA whitening
+    if ((M.dims == 0) || (P.dims == 0))
+    {
+        // The covariance below is rescaled by rows/(rows-1): with a single row that is a division
+        // by zero, and the resulting non-finite P would be kept for every later call.
+        if (patches.rows < 2)
+            CV_Error(Error::StsBadArg, "ZCA whitening parameters are missing from the model and cannot be estimated from a single patch");
+
+        Mat CC;
+        calcCovarMatrix(patches,CC,M,COVAR_NORMAL|COVAR_ROWS|COVAR_SCALE);
+        CC = CC * patches.rows / (patches.rows-1);
+
+
+        Mat e_val,e_vec;
+        eigen(CC.t(),e_val,e_vec);
+        e_vec = e_vec.t();
+        sqrt(1./(e_val + 0.1), e_val);
+
+
+        Mat V = Mat::zeros(e_vec.rows, e_vec.cols, CV_64FC1);
+        Mat D = Mat::eye(e_vec.rows, e_vec.cols, CV_64FC1);
+
+        // reverse the order of the eigenvectors and put the matching scale on the diagonal of D
+        for (int i=0; i<e_vec.cols; i++)
+        {
+            e_vec.col(e_vec.cols-i-1).copyTo(V.col(i));
+            D.col(i) = D.col(i) * e_val.at<double>(0,e_val.rows-i-1);
+        }
+
+        P = V * D * V.t();
+    }
+
+    for (int i=0; i<patches.rows; i++)
+        patches.row(i) = patches.row(i) - M;
+
+    patches = patches * P;
+
+}
+
+// Applies the linear classifier to a single-row feature vector.
+// prob_estimates must have room for nr_class values. It receives, for each class, the logistic
+// function of the class score divided by the sum over all classes. Returns the index of the
+// class with the highest score (the first one on ties).
+double OCRBeamSearchClassifierCNN::eval_feature(Mat& feature, double* prob_estimates)
+{
+    double* const scores_end = prob_estimates + nr_class;
+    std::fill(prob_estimates, scores_end, 0.0);
+
+    // class scores: weighted sum of the features //TODO use vectorized dot product
+    for(int row=0; row<nr_feature; row++)
+    {
+        for(int cls=0; cls<nr_class; cls++)
+            prob_estimates[cls] += weights.at<float>(row,cls)*feature.at<double>(0,row);
+    }
+
+    // the winner is chosen on the raw scores, before they are squashed
+    const int best_class = (int)(std::max_element(prob_estimates, scores_end) - prob_estimates);
+
+    double total = 0;
+    for(double* p = prob_estimates; p != scores_end; ++p)
+    {
+        *p = 1/(1+exp(-*p));
+        total += *p;
+    }
+
+    for(double* p = prob_estimates; p != scores_end; ++p)
+        *p = *p/total;
+
+    return best_class;
+}
+
+
+// Builds the default CNN character classifier from the model file and returns it as a callback.
+Ptr<OCRBeamSearchDecoder::ClassifierCallback> loadOCRBeamSearchClassifierCNN(const std::string& filename)
+{
+    Ptr<OCRBeamSearchClassifierCNN> cnn_classifier = makePtr<OCRBeamSearchClassifierCNN>(filename);
+    return cnn_classifier;
+}
+
+}
+}