From 007b1d9cb07d7f0d086c5fc2f1558608ef57033b Mon Sep 17 00:00:00 2001
From: lluis <lgomez@cvc.uab.es>
Date: Fri, 1 Aug 2014 18:26:20 +0200
Subject: [PATCH] adds MSERsToERStats function and its documentation; adds
 webcam_demo sample code and modifies cmakelists to link with features2d
 module

---
 modules/text/CMakeLists.txt                   |   2 +-
 modules/text/doc/erfilter.rst                 |  14 +
 .../text/include/opencv2/text/erfilter.hpp    |   9 +
 modules/text/samples/webcam_demo.cpp          | 333 ++++++++++++++++++
 modules/text/src/erfilter.cpp                 |  73 ++++
 5 files changed, 430 insertions(+), 1 deletion(-)
 create mode 100644 modules/text/samples/webcam_demo.cpp
diff --git a/modules/text/CMakeLists.txt b/modules/text/CMakeLists.txt
index 4eea5c878..3324bac65 100644
--- a/modules/text/CMakeLists.txt
+++ b/modules/text/CMakeLists.txt
@@ -18,7 +18,7 @@ include_directories(${Tesseract_INCLUDE_DIR})
 endif()
 
 set(the_description "Text Detection and Recognition")
-ocv_define_module(text opencv_ml opencv_highgui opencv_imgproc opencv_core)
+ocv_define_module(text opencv_ml opencv_highgui opencv_imgproc opencv_core opencv_features2d)
 
 if(${Tesseract_FOUND})
   target_link_libraries(opencv_text ${Tesseract_LIBS})
diff --git a/modules/text/doc/erfilter.rst b/modules/text/doc/erfilter.rst
index 685249c99..8539f5e3b 100644
--- a/modules/text/doc/erfilter.rst
+++ b/modules/text/doc/erfilter.rst
@@ -81,6 +81,20 @@ An ER is a 4-connected set of pixels with all its grey-level values smaller than
         ERStat* prev;
     };
 
+MSERsToERStats
+--------------
+Converts MSER contours (vector<Point>) to ERStat regions.
+
+.. ocv:function:: void MSERsToERStats(InputArray image, vector< vector<Point> > &contours, vector< vector<ERStat> > &regions)
+
+    :param image: Source image ``CV_8UC1`` from which the MSERs were extracted.
+    :param contours: Input vector with all the contours (vector<Point>).
+    :param regions: Output where the ERStat regions are stored.
+
+It takes as input the contours provided by the OpenCV MSER feature detector and returns as output two vectors of ERStats. This is because the MSER() output contains both MSER+ and MSER- regions in a single vector; the function separates them into two different vectors (as if the ERStats were extracted from two different channels).
+
+An example of MSERsToERStats in use can be found in the text detection webcam_demo: https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/webcam_demo.cpp
+
 computeNMChannels
 -----------------
 Compute the different channels to be processed independently in the N&M algorithm [Neumann12].
diff --git a/modules/text/include/opencv2/text/erfilter.hpp b/modules/text/include/opencv2/text/erfilter.hpp
index 8acb5c523..d03ec8361 100644
--- a/modules/text/include/opencv2/text/erfilter.hpp
+++ b/modules/text/include/opencv2/text/erfilter.hpp
@@ -274,6 +274,15 @@ CV_EXPORTS void erGrouping(InputArray img, InputArrayOfArrays channels,
                                            const std::string& filename = std::string(),
                                            float minProbablity = 0.5);
 
+/*!
+ * MSERsToERStats function converts MSER contours (vector<Point>) to ERStat regions.
+ * image: CV_8UC1 source image the MSERs were extracted from; contours: regions as provided by the OpenCV MSER
+ * feature detector; regions: output, two vectors of ERStats. MSER output contains both MSER+ and MSER- regions
+ * in a single vector, the function separates them in two vectors (as if extracted from two different channels).
+ * */
+CV_EXPORTS void MSERsToERStats(InputArray image, std::vector<std::vector<Point> > &contours,
+                               std::vector<std::vector<ERStat> > &regions);
+
 }
 }
 #endif // _OPENCV_TEXT_ERFILTER_HPP_
diff --git a/modules/text/samples/webcam_demo.cpp b/modules/text/samples/webcam_demo.cpp
new file mode 100644
index 000000000..79ec7d117
--- /dev/null
+++ b/modules/text/samples/webcam_demo.cpp
@@ -0,0 +1,333 @@
+/*
+ * webcam_demo.cpp
+ *
+ * A demo program of End-to-end Scene Text Detection and Recognition.
+ *
+ * Created on: Jul 31, 2014
+ *     Author: Lluis Gomez i Bigorda <lgomez AT cvc.uab.es>
+ */
+
+#include "opencv2/text.hpp"
+#include "opencv2/core/utility.hpp"
+#include "opencv2/highgui.hpp"
+#include "opencv2/imgproc.hpp"
+#include "opencv2/features2d.hpp"
+
+#include <iostream>
+
+
+using namespace std;
+using namespace cv;
+using namespace cv::text;
+
+//ERStat extraction is done in parallel for different channels
+class Parallel_extractCSER: public cv::ParallelLoopBody
+{
+    // Private state: channels/regions alias the caller's vectors, the filter lists are copies.
+    vector<Mat> &channels;
+    vector< vector<ERStat> > &regions;
+    vector< Ptr<ERFilter> > er_filter1;
+    vector< Ptr<ERFilter> > er_filter2;
+
+public:
+    Parallel_extractCSER(vector<Mat> &channels_in, vector< vector<ERStat> > &regions_out,
+                         vector<Ptr<ERFilter> > stage1, vector<Ptr<ERFilter> > stage2)
+        : channels(channels_in), regions(regions_out), er_filter1(stage1), er_filter2(stage2) {}
+
+    // For every channel index in [range.start, range.end): run the 1st, then the 2nd filter on it.
+    virtual void operator()( const cv::Range &range ) const
+    {
+        for (int ch = range.start; ch < range.end; ++ch)
+        {
+            er_filter1[ch]->run(channels[ch], regions[ch]);
+            er_filter2[ch]->run(channels[ch], regions[ch]);
+        }
+    }
+};
+
+
+//Returns true for strings to discard as wrongly recognised (mostly 'i'/'l'/'I', or dominated by one character)
+bool   isRepetitive(const string& s);
+//Draw the non-root ER's of a group into the mask image `segmentation` via floodFill
+void   er_draw(vector<Mat> &channels, vector<vector<ERStat> > &regions, vector<Vec2i> group, Mat& segmentation);
+
+//Perform text detection and recognition from webcam (optional argv[1] selects the camera index)
+int main(int argc, char* argv[])
+{
+    cout << endl << argv[0] << endl << endl;
+    cout << "A demo program of End-to-end Scene Text Detection and Recognition using webcam." << endl << endl;
+    cout << "  Usage:  " << argv[0] << " [camera_index]" << endl << endl;
+    cout << "  Press 'r' to switch between MSER/CSER regions." << endl;
+    cout << "  Press 'g' to switch between Horizontal and Arbitrary oriented grouping." << endl;
+    cout << "  Press 's' to scale down frame size to 320x240." << endl;
+    cout << "  Press 'ESC' to exit." << endl << endl;
+
+    namedWindow("recognition",WINDOW_NORMAL);
+    bool downsize = false;
+    int  REGION_TYPE = 1;
+    int  GROUPING_ALGORITHM = 0;
+    int  RECOGNITION = 0;
+    const char *region_types_str[2] = {"ERStats", "MSER"};
+    const char *grouping_algorithms_str[2] = {"exhaustive_search", "multioriented"};
+    const char *recognitions_str[3] = {"Tesseract", "NM_chain_features + KNN", "NM_chain_features + MLP"};
+
+    Mat frame,grey,out_img;
+    vector<Mat> channels;
+    vector<vector<ERStat> > regions(2); //two channels
+
+    // Create ERFilter objects with the 1st and 2nd stage default classifiers
+    // since er algorithm is not reentrant we need one filter per channel
+    vector< Ptr<ERFilter> > er_filters1;
+    vector< Ptr<ERFilter> > er_filters2;
+    for (int i=0; i<2; i++)
+    {
+        Ptr<ERFilter> er_filter1 = createERFilterNM1(loadClassifierNM1("trained_classifierNM1.xml"),8,0.00015,0.13,0.2,true,0.1);
+        Ptr<ERFilter> er_filter2 = createERFilterNM2(loadClassifierNM2("trained_classifierNM2.xml"),0.5);
+        er_filters1.push_back(er_filter1);
+        er_filters2.push_back(er_filter2);
+    }
+
+    //Initialize OCR engine (held by a Ptr so it is released on every exit path)
+    //double t_r = getTickCount();
+
+    Ptr<OCRTesseract> ocr_tess(new OCRTesseract());
+
+    //cout << "TIME_OCR_INITIALIZATION_ALT = "<< ((double)getTickCount() - t_r)*1000/getTickFrequency() << endl;
+
+
+    int cam_idx = 0;
+    if (argc > 1)
+        cam_idx = atoi(argv[1]);
+
+    VideoCapture cap(cam_idx);
+    if(!cap.isOpened())
+    {
+        cout << "ERROR: Cannot open camera (" << cam_idx << ")." << endl;
+        return -1;
+    }
+
+    while (1)
+    {
+        double t_all = (double)getTickCount();
+
+        // Stop cleanly when no frame can be grabbed (camera unplugged, stream ended)
+        if (!cap.read(frame) || frame.empty())
+            break;
+        if (downsize)
+            resize(frame,frame,Size(320,240));
+
+        /*Text Detection*/
+
+        // VideoCapture delivers frames in BGR channel order
+        cvtColor(frame,grey,COLOR_BGR2GRAY);
+        // Extract channels to be processed individually
+        channels.clear();
+        channels.push_back(grey);
+        channels.push_back(255-grey);
+
+
+        regions[0].clear();
+        regions[1].clear();
+        //double t_d = (double)getTickCount();
+
+        switch (REGION_TYPE)
+        {
+        case 0:
+        {
+            parallel_for_(cv::Range(0,(int)channels.size()), Parallel_extractCSER(channels,regions,er_filters1,er_filters2));
+            break;
+        }
+        case 1:
+        {
+            //Extract MSER
+            vector<vector<Point> > contours;
+            MSER(21,(int)(0.00002*grey.cols*grey.rows),(int)(0.05*grey.cols*grey.rows),1,0.7)(grey, contours);
+
+            //Convert the output of MSER to suitable input for the grouping/recognition algorithms
+            if (contours.size() > 0)
+                MSERsToERStats(grey, contours, regions);
+
+            break;
+        }
+        case 2:
+        {
+            break;
+        }
+        }
+        //cout << "TIME_REGION_DETECTION_ALT = " << ((double)getTickCount() - t_d)*1000/getTickFrequency() << endl;
+
+        // Detect character groups
+        //double t_g = getTickCount();
+        vector< vector<Vec2i> > nm_region_groups;
+        vector<Rect> nm_boxes;
+        switch (GROUPING_ALGORITHM)
+        {
+        case 0:
+        {
+            erGrouping(frame, channels, regions, nm_region_groups, nm_boxes, ERGROUPING_ORIENTATION_HORIZ);
+            break;
+        }
+        case 1:
+        {
+            erGrouping(frame, channels, regions, nm_region_groups, nm_boxes, ERGROUPING_ORIENTATION_ANY, "./trained_classifier_erGrouping.xml", 0.5);
+            break;
+        }
+        }
+        //cout << "TIME_GROUPING_ALT = " << ((double)getTickCount() - t_g)*1000/getTickFrequency() << endl;
+
+
+
+
+        /*Text Recognition (OCR)*/
+
+
+        frame.copyTo(out_img);
+        float scale_img  = 600./frame.rows;
+        float scale_font = (2-scale_img)/1.4;
+        vector<string> words_detection;
+        string output;
+
+        //t_r = getTickCount();
+
+        for (int i=0; i<(int)nm_boxes.size(); i++)
+        {
+            rectangle(out_img, nm_boxes[i].tl(), nm_boxes[i].br(), Scalar(255,255,0),3);
+
+
+            Mat group_img = Mat::zeros(frame.rows+2, frame.cols+2, CV_8UC1);
+            er_draw(channels, regions, nm_region_groups[i], group_img);
+            group_img(nm_boxes[i]).copyTo(group_img);
+            copyMakeBorder(group_img,group_img,15,15,15,15,BORDER_CONSTANT,Scalar(0));
+
+            vector<Rect>   boxes;
+            vector<string> words;
+            vector<float>  confidences;
+
+
+            float min_confidence1 = 0.f, min_confidence2 = 0.f; // only set below for the Tesseract path
+
+            if (RECOGNITION == 0)
+            {
+                ocr_tess->run(group_img, output, &boxes, &words, &confidences, OCR_LEVEL_WORD);
+                min_confidence1 = 51.;
+                min_confidence2 = 60.;
+            }
+
+            output.erase(remove(output.begin(), output.end(), '\n'), output.end());
+            //cout << "OCR output = \"" << output << "\" lenght = " << output.size() << endl;
+            if (output.size() < 3)
+                continue;
+
+            for (int j=0; j<(int)boxes.size(); j++)
+            {
+                boxes[j].x += nm_boxes[i].x-15;
+                boxes[j].y += nm_boxes[i].y-15;
+
+                //cout << "  word = " << words[j] << "\t confidence = " << confidences[j] << endl;
+                if ((words[j].size() < 2) || (confidences[j] < min_confidence1) ||
+                        ((words[j].size()==2) && (words[j][0] == words[j][1])) ||
+                        ((words[j].size()< 4) && (confidences[j] < min_confidence2)) ||
+                        isRepetitive(words[j]))
+                    continue;
+                words_detection.push_back(words[j]);
+                rectangle(out_img, boxes[j].tl(), boxes[j].br(), Scalar(255,0,255),3);
+                Size word_size = getTextSize(words[j], FONT_HERSHEY_SIMPLEX, scale_font, (int)(3*scale_font), NULL);
+                rectangle(out_img, boxes[j].tl()-Point(3,word_size.height+3), boxes[j].tl()+Point(word_size.width,0), Scalar(255,0,255),-1);
+                putText(out_img, words[j], boxes[j].tl()-Point(1,1), FONT_HERSHEY_SIMPLEX, scale_font, Scalar(255,255,255),(int)(3*scale_font));
+            }
+
+        }
+
+        //cout << "TIME_OCR_ALT = " << ((double)getTickCount() - t_r)*1000/getTickFrequency() << endl;
+
+
+        t_all = ((double)getTickCount() - t_all)*1000/getTickFrequency();
+        char buff[100];
+        sprintf(buff, "%2.1f Fps. @ %dx%d", (float)(1000/t_all), frame.cols, frame.rows); // size actually processed
+        string fps_info = buff;
+        rectangle(out_img, Point(out_img.rows-160,out_img.rows-70), Point(out_img.cols,out_img.rows), Scalar(255,255,255),-1);
+        putText(out_img, fps_info, Point(10,out_img.rows-10), FONT_HERSHEY_DUPLEX, scale_font, Scalar(255,0,0));
+        putText(out_img, region_types_str[REGION_TYPE], Point(out_img.rows-150,out_img.rows-50), FONT_HERSHEY_DUPLEX, scale_font, Scalar(255,0,0));
+        putText(out_img, grouping_algorithms_str[GROUPING_ALGORITHM], Point(out_img.rows-150,out_img.rows-30), FONT_HERSHEY_DUPLEX, scale_font, Scalar(255,0,0));
+        putText(out_img, recognitions_str[RECOGNITION], Point(out_img.rows-150,out_img.rows-10), FONT_HERSHEY_DUPLEX, scale_font, Scalar(255,0,0));
+
+
+        imshow("recognition", out_img);
+        //imwrite("recognition_alt.jpg", out_img);
+        int key = waitKey(30);
+        if (key == 27) //ESC
+        {
+            cout << "esc key pressed" << endl;
+            break;
+        }
+        else
+        {
+            switch (key)
+            {
+            case 'g':
+                GROUPING_ALGORITHM = (GROUPING_ALGORITHM+1)%2;
+                cout << "Grouping switched to " << GROUPING_ALGORITHM << endl;
+                break;
+                //case 111: //o
+                //  RECOGNITION = (RECOGNITION+1)%3;
+                //  cout << "OCR switched to " << RECOGNITION << endl;
+                //  break;
+            case 'r':
+                REGION_TYPE = (REGION_TYPE+1)%2;
+                cout << "Regions switched to " << REGION_TYPE << endl;
+                break;
+            case 's':
+                downsize = !downsize;
+                break;
+            default:
+                break;
+
+            }
+        }
+
+    }
+
+    return 0;
+}
+
+// Heuristic filter for OCR garbage. Returns true when `s` is empty, when more than
+// (size+1)/2 of its characters are 'i', 'l' or 'I', when every character equals the
+// first one, or when more than 2/3 of the characters equal the last one.
+bool isRepetitive(const string& s)
+{
+    const int len = (int)s.size();
+    if (len == 0)
+        return true; // nothing worth keeping; also avoids reading s[len-1] out of bounds
+
+    int thin_count  = 0; // occurrences of 'i', 'l', 'I'
+    int first_count = 0; // occurrences of the first character
+    int last_count  = 0; // occurrences of the last character
+    const char first = s[0];
+    const char last  = s[len-1];
+    for (int i=0; i<len; i++)
+    {
+        const char c = s[i];
+        if ((c == 'i') || (c == 'l') || (c == 'I'))
+            thin_count++;
+        if (c == first)
+            first_count++;
+        if (c == last)
+            last_count++;
+    }
+    return (thin_count > (len+1)/2) || (first_count == len) || (last_count > (len*2)/3);
+}
+
+
+void er_draw(vector<Mat> &channels, vector<vector<ERStat> > &regions, vector<Vec2i> group, Mat& segmentation)
+{
+    // floodFill flags: connectivity 4, mask fill value 255, fixed range, write to the mask only
+    const int flags = 4 + (255 << 8) + FLOODFILL_FIXED_RANGE + FLOODFILL_MASK_ONLY;
+    for (size_t g = 0; g < group.size(); g++)
+    {
+        const int ch = group[g][0];
+        ERStat er = regions[ch][group[g][1]];
+        if (er.parent == NULL) // deprecate the root region
+            continue;
+        const Point seed(er.pixel%channels[ch].cols, er.pixel/channels[ch].cols);
+        floodFill(channels[ch],segmentation,seed,Scalar(255),0,Scalar(er.level),Scalar(0),flags);
+    }
+}
diff --git a/modules/text/src/erfilter.cpp b/modules/text/src/erfilter.cpp
index 19b0e752c..2c9695b1e 100644
--- a/modules/text/src/erfilter.cpp
+++ b/modules/text/src/erfilter.cpp
@@ -4069,5 +4069,78 @@ void erGrouping(InputArray image, InputArrayOfArrays channels, vector<vector<ERS
 
 }
 
+/*!
+ * MSERsToERStats function converts MSER contours (vector<Point>) to ERStat regions.
+ * It takes as input the contours provided by the OpenCV MSER feature detector and returns as output two vectors
+ * of ERStats. MSER output contains both MSER+ and MSER- regions in a single vector, the function separates
+ * them in two different vectors (this is as if the ERStats were extracted from two different channels).
+ * */
+void MSERsToERStats(InputArray image, vector<vector<Point> > &contours, vector<vector<ERStat> > &mser_regions)
+{
+  CV_Assert(!contours.empty());
+  Mat grey = image.getMat();
+  // assert correct image type
+  CV_Assert( grey.type() == CV_8UC1 );
+
+  //MSER output contains both MSER+ and MSER- regions in a single vector but we want them separated
+  mser_regions.clear();
+  mser_regions.resize(2);
+
+  //Append "fake" root region to simulate a tree structure (needed for grouping).
+  //Capacity is reserved first: every region stores a pointer to the root living in the same vector,
+  //so a reallocation triggered by a later push_back would leave all stored parent pointers dangling.
+  ERStat fake_root;
+  for (int c=0; c<2; c++)
+  {
+    mser_regions[c].reserve(contours.size()+1);
+    mser_regions[c].push_back(fake_root);
+  }
+
+  Mat mask = Mat::zeros(grey.rows, grey.cols, CV_8UC1);
+  Mat mtmp = Mat::zeros(grey.rows, grey.cols, CV_8UC1);
+  Mat element = getStructuringElement( MORPH_RECT, Size(5,5), Point(2,2) ); //loop-invariant
+  for (int i=0; i<(int)contours.size(); i++)
+  {
+    const vector<Point>& r = contours[i];
+    if (r.empty())
+      continue; //no pixels to measure; its bounding rect would give an empty ROI below
+
+    ERStat cser;
+    cser.area = (int)r.size();
+    cser.rect = boundingRect(r);
+
+    float avg_intensity = 0;
+    for ( int j = 0; j < (int)r.size(); j++ )
+    {
+      Point pt = r[j];
+      mask.at<unsigned char>(pt) = 255;
+      avg_intensity += (float)grey.at<unsigned char>(pt)/(int)r.size();
+    }
+
+    double min, max;
+    Point min_loc, max_loc;
+    minMaxLoc(grey(cser.rect), &min, &max, &min_loc, &max_loc, mask(cser.rect));
+
+    //mtmp = dilated region minus the region itself, i.e. the ring of pixels around it
+    dilate( mask(cser.rect), mtmp(cser.rect), element );
+    absdiff( mtmp(cser.rect), mask(cser.rect), mtmp(cser.rect) );
+
+    Scalar mean,std;
+    meanStdDev(grey(cser.rect), mean, std, mtmp(cser.rect) );
+
+    //a region darker than its surrounding ring goes to vector 0, otherwise to vector 1
+    const bool darker = avg_intensity < mean[0];
+    const int  idx    = darker ? 0 : 1;
+    const Point& seed = darker ? max_loc : min_loc;
+    cser.level  = darker ? (int)max : 255-(int)min;
+    cser.pixel  = (seed.y+cser.rect.y)*grey.cols+seed.x+cser.rect.x;
+    cser.parent = &(mser_regions[idx][0]);
+    mser_regions[idx].push_back(cser);
+
+    mask(cser.rect) = 0;
+    mtmp(cser.rect) = 0;
+  }
+}
+
 }
 }