Merge pull request #1469 from lluisgomez:scene_text_detection_erGrouping

12 years ago · 1445a29e1c
parent 0ecd7913f8 2837bfd9fa
commit 1445a29e1c
3 changed files with 2011 additions and 76 deletions
--- a/modules/objdetect/include/opencv2/objdetect/erfilter.hpp
+++ b/modules/objdetect/include/opencv2/objdetect/erfilter.hpp
@@ -236,5 +236,28 @@ enum { ERFILTER_NM_RGBLGrad = 0,
 */
 CV_EXPORTS void computeNMChannels(InputArray _src, OutputArrayOfArrays _channels, int _mode = ERFILTER_NM_RGBLGrad);

+
+/*!
+    Find groups of Extremal Regions that are organized as text blocks. This function implements
+    the grouping algorithm described in:
+    Gomez L. and Karatzas D.: Multi-script Text Extraction from Natural Scenes, ICDAR 2013.
+    Note that this implementation constrains the results to horizontally-aligned text and
+    Latin script (since the ERFilter classifiers are trained only for Latin script detection).
+
+    The algorithm combines two different clustering techniques in a single parameter-free procedure
+    to detect groups of regions organized as text. The maximally meaningful groups are first detected
+    in several feature spaces, where each feature space is a combination of proximity information
+    (x,y coordinates) and a similarity measure (intensity, color, size, gradient magnitude, etc.),
+    thus providing a set of hypotheses of text groups. An Evidence Accumulation framework is used to
+    combine all these hypotheses to get the final estimate. Each of the resulting groups is finally
+    heuristically validated in order to assess whether it forms a valid horizontally-aligned text block.
+
+    \param  src            Vector of single-channel images (CV_8UC1) from which the regions were extracted.
+    \param  regions        Vector of ERs retrieved from the ERFilter algorithm for each channel.
+    \param  groups         The output of the algorithm is stored in this parameter as a list of rectangles.
+*/
+CV_EXPORTS void erGrouping(InputArrayOfArrays src, std::vector<std::vector<ERStat> > &regions,
+                                                   std::vector<Rect> &groups);
+
 }
 #endif // _OPENCV_ERFILTER_HPP_
--- a/modules/objdetect/src/erfilter.cpp
+++ b/modules/objdetect/src/erfilter.cpp
--- a/samples/cpp/erfilter.cpp
+++ b/samples/cpp/erfilter.cpp
@@ -16,105 +16,90 @@
 using  namespace std;
 using  namespace cv;

-void  er_draw(Mat &src, Mat &dst, ERStat& er);
+void show_help_and_exit(const char *cmd);
+void groups_draw(Mat &src, vector<Rect> &groups);
+void er_draw(Mat &src, Mat &dst, ERStat& er);

-void  er_draw(Mat &src, Mat &dst, ERStat& er)
+int  main(int argc, const char * argv[])
 {

-    if (er.parent != NULL) // deprecate the root region
-    {
-        int newMaskVal = 255;
-        int flags = 4 + (newMaskVal << 8) + FLOODFILL_FIXED_RANGE + FLOODFILL_MASK_ONLY;
-        floodFill(src,dst,Point(er.pixel%src.cols,er.pixel/src.cols),Scalar(255),0,Scalar(er.level),Scalar(0),flags);
-    }
+    if (argc < 2) show_help_and_exit(argv[0]);

-}
-
-int  main(int argc, const char * argv[])
-{
+    Mat src = imread(argv[1]);

+    // Extract channels to be processed individually
+    vector<Mat> channels;
+    computeNMChannels(src, channels);

-    vector<ERStat> regions;
+    int cn = (int)channels.size();
+    // Append negative channels to detect ER- (bright regions over dark background)
+    for (int c = 0; c < cn-1; c++)
+        channels.push_back(255-channels[c]);

-    if (argc < 2) {
-        cout << "Demo program of the Extremal Region Filter algorithm described in " << endl;
-        cout << "Neumann L., Matas J.: Real-Time Scene Text Localization and Recognition, CVPR 2012" << endl << endl;
-        cout << "    Usage: " << argv[0] << " input_image <optional_groundtruth_image>" << endl;
-        cout << "    Default classifier files (trained_classifierNM*.xml) should be in ./" << endl;
-        return -1;
-    }
+    // Create ERFilter objects with the 1st and 2nd stage default classifiers
+    Ptr<ERFilter> er_filter1 = createERFilterNM1(loadClassifierNM1("trained_classifierNM1.xml"),8,0.00025,0.13,0.4,true,0.1);
+    Ptr<ERFilter> er_filter2 = createERFilterNM2(loadClassifierNM2("trained_classifierNM2.xml"),0.3);

-    Mat original = imread(argv[1]);
-    Mat gt;
-    if (argc > 2)
+    vector<vector<ERStat> > regions(channels.size());
+    // Apply the default cascade classifier to each independent channel (could be done in parallel)
+    for (int c=0; c<(int)channels.size(); c++)
    {
-        gt = imread(argv[2]);
-        cvtColor(gt, gt, COLOR_RGB2GRAY);
-        threshold(gt, gt, 254, 255, THRESH_BINARY);
+        er_filter1->run(channels[c], regions[c]);
+        er_filter2->run(channels[c], regions[c]);
    }
-    Mat grey(original.size(),CV_8UC1);
-    cvtColor(original,grey,COLOR_RGB2GRAY);
-
-    double t = (double)getTickCount();

-    // Build ER tree and filter with the 1st stage default classifier
-    Ptr<ERFilter> er_filter1 = createERFilterNM1(loadClassifierNM1("trained_classifierNM1.xml"));
+    // Detect character groups
+    vector<Rect> groups;
+    erGrouping(channels, regions, groups);

-    er_filter1->run(grey, regions);
-
-    t = (double)getTickCount() - t;
-    cout << " --------------------------------------------------------------------------------------------------" << endl;
-    cout << "\t FIRST STAGE CLASSIFIER done in " << t * 1000. / getTickFrequency() << " ms." << endl;
-    cout << " --------------------------------------------------------------------------------------------------" << endl;
-    cout << setw(9) << regions.size()+er_filter1->getNumRejected() << "\t Extremal Regions extracted " << endl;
-    cout << setw(9) << regions.size() << "\t Extremal Regions selected by the first stage of the sequential classifier." << endl;
-    cout << "\t \t (saving into out_second_stage.jpg)" << endl;
-    cout << " --------------------------------------------------------------------------------------------------" << endl;
+    // draw groups
+    groups_draw(src, groups);
+    imshow("grouping",src);
+    waitKey(-1);

+    // memory clean-up
    er_filter1.release();
-
-    // draw regions
-    Mat mask = Mat::zeros(grey.rows+2,grey.cols+2,CV_8UC1);
-    for (int r=0; r<(int)regions.size(); r++)
-        er_draw(grey, mask, regions.at(r));
-    mask = 255-mask;
-    imwrite("out_first_stage.jpg", mask);
-
-    if (argc > 2)
+    er_filter2.release();
+    regions.clear();
+    if (!groups.empty())
    {
-        Mat tmp_mask = (255-gt) & (255-mask(Rect(Point(1,1),Size(mask.cols-2,mask.rows-2))));
-        cout << "Recall for the 1st stage filter = " << (float)countNonZero(tmp_mask) / countNonZero(255-gt) << endl;
+        groups.clear();
    }
+}

-    t = (double)getTickCount();

-    // Default second stage classifier
-    Ptr<ERFilter> er_filter2 = createERFilterNM2(loadClassifierNM2("trained_classifierNM2.xml"));
-    er_filter2->run(grey, regions);

-    t = (double)getTickCount() - t;
-    cout << " --------------------------------------------------------------------------------------------------" << endl;
-    cout << "\t SECOND STAGE CLASSIFIER done in " << t * 1000. / getTickFrequency() << " ms." << endl;
-    cout << " --------------------------------------------------------------------------------------------------" << endl;
-    cout << setw(9) << regions.size() << "\t Extremal Regions selected by the second stage of the sequential classifier." << endl;
-    cout << "\t \t (saving into out_second_stage.jpg)" << endl;
-    cout << " --------------------------------------------------------------------------------------------------" << endl;
+// helper functions

-    er_filter2.release();
-
-    // draw regions
-    mask = mask*0;
-    for (int r=0; r<(int)regions.size(); r++)
-        er_draw(grey, mask, regions.at(r));
-    mask = 255-mask;
-    imwrite("out_second_stage.jpg", mask);
+void show_help_and_exit(const char *cmd)
+{
+    cout << endl << cmd << endl << endl;
+    cout << "Demo program of the Extremal Region Filter algorithm described in " << endl;
+    cout << "Neumann L., Matas J.: Real-Time Scene Text Localization and Recognition, CVPR 2012" << endl << endl;
+    cout << "    Usage: " << cmd << " <input_image> " << endl;
+    cout << "    Default classifier files (trained_classifierNM*.xml) must be in current directory" << endl << endl;
+    exit(-1);
+}

-    if (argc > 2)
+void groups_draw(Mat &src, vector<Rect> &groups)
+{
+    for (int i=groups.size()-1; i>=0; i--)
    {
-        Mat tmp_mask = (255-gt) & (255-mask(Rect(Point(1,1),Size(mask.cols-2,mask.rows-2))));
-        cout << "Recall for the 2nd stage filter = " << (float)countNonZero(tmp_mask) / countNonZero(255-gt) << endl;
+        if (src.type() == CV_8UC3)
+            rectangle(src,groups.at(i).tl(),groups.at(i).br(),Scalar( 0, 255, 255 ), 3, 8 );
+        else
+            rectangle(src,groups.at(i).tl(),groups.at(i).br(),Scalar( 255 ), 3, 8 );
    }
+}

-    regions.clear();
+void er_draw(Mat &src, Mat &dst, ERStat& er)
+{
+
+    if (er.parent != NULL) // deprecate the root region
+    {
+        int newMaskVal = 255;
+        int flags = 4 + (newMaskVal << 8) + FLOODFILL_FIXED_RANGE + FLOODFILL_MASK_ONLY;
+        floodFill(src,dst,Point(er.pixel%src.cols,er.pixel/src.cols),Scalar(255),0,Scalar(er.level),Scalar(0),flags);
+    }

 }