adds MSERsToERStats function and its documentation; adds webcam_demo sample code and modifies cmakelists to link with features2d module

pull/47/head
lluis 10 years ago
parent bcf38c3fbf
commit 007b1d9cb0
  1. 2
      modules/text/CMakeLists.txt
  2. 14
      modules/text/doc/erfilter.rst
  3. 9
      modules/text/include/opencv2/text/erfilter.hpp
  4. 333
      modules/text/samples/webcam_demo.cpp
  5. 73
      modules/text/src/erfilter.cpp

@ -18,7 +18,7 @@ include_directories(${Tesseract_INCLUDE_DIR})
endif()
set(the_description "Text Detection and Recognition")
ocv_define_module(text opencv_ml opencv_highgui opencv_imgproc opencv_core)
ocv_define_module(text opencv_ml opencv_highgui opencv_imgproc opencv_core opencv_features2d)
if(${Tesseract_FOUND})
target_link_libraries(opencv_text ${Tesseract_LIBS})

@ -81,6 +81,20 @@ An ER is a 4-connected set of pixels with all its grey-level values smaller than
ERStat* prev;
};
MSERsToERStats
--------------
Converts MSER contours (vector<Point>) to ERStat regions.
.. ocv:function:: void MSERsToERStats(InputArray image, vector< vector<Point> > &contours, vector< vector<ERStat> > &regions)
:param image: Source image ``CV_8UC1`` from which the MSERs were extracted.
:param contours: Input vector with all the contours (vector<Point>).
:param regions: Output where the ERStat regions are stored.
It takes as input the contours provided by the OpenCV MSER feature detector and returns as output two vectors of ERStats. This is because MSER() output contains both MSER+ and MSER- regions in a single vector<Point>; the function separates them into two different vectors (this is as if the ERStats were extracted from two different channels).
An example of MSERsToERStats in use can be found in the text detection webcam_demo: https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/webcam_demo.cpp
computeNMChannels
-----------------
Compute the different channels to be processed independently in the N&M algorithm [Neumann12].

@ -274,6 +274,15 @@ CV_EXPORTS void erGrouping(InputArray img, InputArrayOfArrays channels,
const std::string& filename = std::string(),
float minProbablity = 0.5);
/*!
 * MSERsToERStats function converts MSER contours (vector<Point>) to ERStat regions.
 * It takes as input the contours provided by the OpenCV MSER feature detector and returns as output two vectors
 * of ERStats. MSER output contains both MSER+ and MSER- regions in a single vector<Point>, the function separates
 * them into two different vectors (this is as if the ERStats were extracted from two different channels).
 * */
CV_EXPORTS void MSERsToERStats(InputArray image, std::vector<std::vector<Point> > &contours,
std::vector<std::vector<ERStat> > &regions);
}
}
#endif // _OPENCV_TEXT_ERFILTER_HPP_

@ -0,0 +1,333 @@
/*
* webcam-demo.cpp
*
* A demo program of End-to-end Scene Text Detection and Recognition.
*
* Created on: Jul 31, 2014
* Author: Lluis Gomez i Bigorda <lgomez AT cvc.uab.es>
*/
#include "opencv2/text.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/features2d.hpp"

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <iostream>
using namespace std;
using namespace cv;
using namespace cv::text;
// Runs the two-stage ER (CSER) extraction over several image channels in
// parallel. A separate filter pair is kept for every channel because the ER
// algorithm is not reentrant.
class Parallel_extractCSER: public cv::ParallelLoopBody
{
private:
    vector<Mat> &channels;                  // input channels (one per job)
    vector< vector<ERStat> > &regions;      // output regions, parallel to channels
    vector< Ptr<ERFilter> > er_filter1;     // 1st-stage filter, one per channel
    vector< Ptr<ERFilter> > er_filter2;     // 2nd-stage filter, one per channel
public:
    Parallel_extractCSER(vector<Mat> &_channels, vector< vector<ERStat> > &_regions,
                         vector<Ptr<ERFilter> > _er_filter1, vector<Ptr<ERFilter> > _er_filter2)
        : channels(_channels), regions(_regions),
          er_filter1(_er_filter1), er_filter2(_er_filter2) {}

    virtual void operator()( const cv::Range &r ) const
    {
        // Every index in the range is an independent channel: run both filter
        // stages on it, accumulating the surviving regions in-place.
        for (int idx = r.start; idx < r.end; idx++)
        {
            er_filter1[idx]->run(channels[idx], regions[idx]);
            er_filter2[idx]->run(channels[idx], regions[idx]);
        }
    }
};
//Discard wrongly recognised strings
bool isRepetitive(const string& s);
//Draw ER's in an image via floodFill
void er_draw(vector<Mat> &channels, vector<vector<ERStat> > &regions, vector<Vec2i> group, Mat& segmentation);
//Perform text detection and recognition from webcam
int main(int argc, char* argv[])
{
cout << endl << argv[0] << endl << endl;
cout << "A demo program of End-to-end Scene Text Detection and Recognition using webcam." << endl << endl;
cout << " Usage: " << argv[0] << " [camera_index]" << endl << endl;
cout << " Press 'e' to switch between MSER/CSER regions." << endl;
cout << " Press 'g' to switch between Horizontal and Arbitrary oriented grouping." << endl;
cout << " Press 's' to scale down frame size to 320x240." << endl;
cout << " Press 'ESC' to exit." << endl << endl;
namedWindow("recognition",WINDOW_NORMAL);
bool downsize = false;
int REGION_TYPE = 1;
int GROUPING_ALGORITHM = 0;
int RECOGNITION = 0;
char *region_types_str[2] = {const_cast<char *>("ERStats"), const_cast<char *>("MSER")};
char *grouping_algorithms_str[2] = {const_cast<char *>("exhaustive_search"), const_cast<char *>("multioriented")};
char *recognitions_str[3] = {const_cast<char *>("Tesseract"), const_cast<char *>("NM_chain_features + KNN"), const_cast<char *>("NM_chain_features + MLP")};
Mat frame,grey,orig_grey,out_img;
vector<Mat> channels;
vector<vector<ERStat> > regions(2); //two channels
// Create ERFilter objects with the 1st and 2nd stage default classifiers
// since er algorithm is not reentrant we need one filter for channel
vector< Ptr<ERFilter> > er_filters1;
vector< Ptr<ERFilter> > er_filters2;
for (int i=0; i<2; i++)
{
Ptr<ERFilter> er_filter1 = createERFilterNM1(loadClassifierNM1("trained_classifierNM1.xml"),8,0.00015,0.13,0.2,true,0.1);
Ptr<ERFilter> er_filter2 = createERFilterNM2(loadClassifierNM2("trained_classifierNM2.xml"),0.5);
er_filters1.push_back(er_filter1);
er_filters2.push_back(er_filter2);
}
//Initialize OCR engine
//double t_r = getTickCount();
OCRTesseract *ocr_tess = new OCRTesseract();
//cout << "TIME_OCR_INITIALIZATION_ALT = "<< ((double)getTickCount() - t_r)*1000/getTickFrequency() << endl;
int cam_idx = 0;
if (argc > 1)
cam_idx = atoi(argv[1]);
VideoCapture cap(cam_idx);
if(!cap.isOpened())
{
cout << "ERROR: Cannot open default camera (0)." << endl;
return -1;
}
while (1)
{
double t_all = getTickCount();
cap.read(frame);
if (downsize)
resize(frame,frame,Size(320,240));
/*Text Detection*/
cvtColor(frame,grey,COLOR_RGB2GRAY);
grey.copyTo(orig_grey);
// Extract channels to be processed individually
channels.clear();
channels.push_back(grey);
channels.push_back(255-grey);
regions[0].clear();
regions[1].clear();
//double t_d = (double)getTickCount();
switch (REGION_TYPE)
{
case 0:
{
parallel_for_(cv::Range(0,channels.size()), Parallel_extractCSER(channels,regions,er_filters1,er_filters2));
break;
}
case 1:
{
//Extract MSER
vector<vector<Point> > contours;
MSER(21,(int)(0.00002*grey.cols*grey.rows),(int)(0.05*grey.cols*grey.rows),1,0.7)(grey, contours);
//Convert the output of MSER to suitable input for the grouping/recognition algorithms
if (contours.size() > 0)
MSERsToERStats(grey, contours, regions);
break;
}
case 2:
{
break;
}
}
//cout << "TIME_REGION_DETECTION_ALT = " << ((double)getTickCount() - t_d)*1000/getTickFrequency() << endl;
// Detect character groups
//double t_g = getTickCount();
vector< vector<Vec2i> > nm_region_groups;
vector<Rect> nm_boxes;
switch (GROUPING_ALGORITHM)
{
case 0:
{
erGrouping(frame, channels, regions, nm_region_groups, nm_boxes, ERGROUPING_ORIENTATION_HORIZ);
break;
}
case 1:
{
erGrouping(frame, channels, regions, nm_region_groups, nm_boxes, ERGROUPING_ORIENTATION_ANY, "./trained_classifier_erGrouping.xml", 0.5);
break;
}
}
//cout << "TIME_GROUPING_ALT = " << ((double)getTickCount() - t_g)*1000/getTickFrequency() << endl;
/*Text Recognition (OCR)*/
frame.copyTo(out_img);
float scale_img = 600./frame.rows;
float scale_font = (2-scale_img)/1.4;
vector<string> words_detection;
string output;
//t_r = getTickCount();
for (int i=0; i<(int)nm_boxes.size(); i++)
{
rectangle(out_img, nm_boxes[i].tl(), nm_boxes[i].br(), Scalar(255,255,0),3);
Mat group_img = Mat::zeros(frame.rows+2, frame.cols+2, CV_8UC1);
er_draw(channels, regions, nm_region_groups[i], group_img);
group_img(nm_boxes[i]).copyTo(group_img);
copyMakeBorder(group_img,group_img,15,15,15,15,BORDER_CONSTANT,Scalar(0));
vector<Rect> boxes;
vector<string> words;
vector<float> confidences;
float min_confidence1,min_confidence2;
if (RECOGNITION == 0)
{
ocr_tess->run(group_img, output, &boxes, &words, &confidences, OCR_LEVEL_WORD);
min_confidence1 = 51.;
min_confidence2 = 60.;
}
output.erase(remove(output.begin(), output.end(), '\n'), output.end());
//cout << "OCR output = \"" << output << "\" lenght = " << output.size() << endl;
if (output.size() < 3)
continue;
for (int j=0; j<(int)boxes.size(); j++)
{
boxes[j].x += nm_boxes[i].x-15;
boxes[j].y += nm_boxes[i].y-15;
//cout << " word = " << words[j] << "\t confidence = " << confidences[j] << endl;
if ((words[j].size() < 2) || (confidences[j] < min_confidence1) ||
((words[j].size()==2) && (words[j][0] == words[j][1])) ||
((words[j].size()< 4) && (confidences[j] < min_confidence2)) ||
isRepetitive(words[j]))
continue;
words_detection.push_back(words[j]);
rectangle(out_img, boxes[j].tl(), boxes[j].br(), Scalar(255,0,255),3);
Size word_size = getTextSize(words[j], FONT_HERSHEY_SIMPLEX, scale_font, 3*scale_font, NULL);
rectangle(out_img, boxes[j].tl()-Point(3,word_size.height+3), boxes[j].tl()+Point(word_size.width,0), Scalar(255,0,255),-1);
putText(out_img, words[j], boxes[j].tl()-Point(1,1), FONT_HERSHEY_SIMPLEX, scale_font, Scalar(255,255,255),3*scale_font);
}
}
//cout << "TIME_OCR_ALT = " << ((double)getTickCount() - t_r)*1000/getTickFrequency() << endl;
t_all = ((double)getTickCount() - t_all)*1000/getTickFrequency();
char buff[100];
sprintf(buff, "%2.1f Fps. @ 640x480", (float)(1000/t_all));
string fps_info = buff;
rectangle(out_img, Point(out_img.rows-160,out_img.rows-70), Point(out_img.cols,out_img.rows), Scalar(255,255,255),-1);
putText(out_img, fps_info, Point(10,out_img.rows-10), FONT_HERSHEY_DUPLEX, scale_font, Scalar(255,0,0));
putText(out_img, region_types_str[REGION_TYPE], Point(out_img.rows-150,out_img.rows-50), FONT_HERSHEY_DUPLEX, scale_font, Scalar(255,0,0));
putText(out_img, grouping_algorithms_str[GROUPING_ALGORITHM], Point(out_img.rows-150,out_img.rows-30), FONT_HERSHEY_DUPLEX, scale_font, Scalar(255,0,0));
putText(out_img, recognitions_str[RECOGNITION], Point(out_img.rows-150,out_img.rows-10), FONT_HERSHEY_DUPLEX, scale_font, Scalar(255,0,0));
imshow("recognition", out_img);
//imwrite("recognition_alt.jpg", out_img);
int key = waitKey(30);
if (key == 27) //wait for key
{
cout << "esc key pressed" << endl;
break;
}
else
{
switch (key)
{
case 103: //g
GROUPING_ALGORITHM = (GROUPING_ALGORITHM+1)%2;
cout << "Grouping switched to " << GROUPING_ALGORITHM << endl;
break;
//case 111: //o
// RECOGNITION = (RECOGNITION+1)%3;
// cout << "OCR switched to " << RECOGNITION << endl;
// break;
case 114: //r
REGION_TYPE = (REGION_TYPE+1)%2;
cout << "Regions switched to " << REGION_TYPE << endl;
break;
case 115: //s
downsize = !downsize;
break;
default:
break;
}
}
}
return 0;
}
// Heuristic filter for OCR noise: a word is "repetitive" (likely a false
// detection) when it is dominated by easily-confused glyphs or by one
// repeated character. Returns true when any of these holds:
//   - more than half the characters are 'i', 'l' or 'I', or
//   - every character equals the first one, or
//   - more than two thirds of the characters equal the last one.
bool isRepetitive(const std::string& s)
{
    // BUGFIX: guard the empty string; the original indexed s[0] and
    // s[size()-1] unconditionally (out-of-range on empty input).
    if (s.empty())
        return false;

    int confusable = 0;     // count of 'i' / 'l' / 'I'
    int same_as_first = 0;  // characters equal to s[0]
    int same_as_last = 0;   // characters equal to the final character
    int first = (int)s[0];
    int last = (int)s[s.size()-1];
    for (int i = 0; i < (int)s.size(); i++)
    {
        if ((s[i] == 'i') || (s[i] == 'l') || (s[i] == 'I'))
            confusable++;
        if ((int)s[i] == first)
            same_as_first++;
        if ((int)s[i] == last)
            same_as_last++;
    }
    return (confusable > ((int)s.size()+1)/2) ||
           (same_as_first == (int)s.size()) ||
           (same_as_last > ((int)s.size()*2)/3);
}
// Paint the pixels of every ER in `group` into `segmentation` (a CV_8UC1
// mask, 2px larger than the source frame) by flood-filling each region's
// seed pixel on its source channel.
void er_draw(vector<Mat> &channels, vector<vector<ERStat> > &regions, vector<Vec2i> group, Mat& segmentation)
{
    for (int i = 0; i < (int)group.size(); i++)
    {
        const int chan = group[i][0]; // which channel the region came from
        ERStat er = regions[chan][group[i][1]];

        // The root of each channel's component tree is a dummy region: skip it.
        if (er.parent == NULL)
            continue;

        const int newMaskVal = 255;
        const int flags = 4 + (newMaskVal << 8) + FLOODFILL_FIXED_RANGE + FLOODFILL_MASK_ONLY;
        // er.pixel is a linear index into the channel image.
        Point seed(er.pixel % channels[chan].cols, er.pixel / channels[chan].cols);
        floodFill(channels[chan], segmentation, seed,
                  Scalar(255), 0, Scalar(er.level), Scalar(0), flags);
    }
}

@ -4069,5 +4069,78 @@ void erGrouping(InputArray image, InputArrayOfArrays channels, vector<vector<ERS
}
/*!
 * MSERsToERStats function converts MSER contours (vector<Point>) to ERStat regions.
 * It takes as input the contours provided by the OpenCV MSER feature detector and returns as output two vectors
 * of ERStats. MSER output contains both MSER+ and MSER- regions in a single vector<Point>, the function separates
 * them into two different vectors (this is as if the ERStats were extracted from two different channels).
 *
 * \param image        source CV_8UC1 image the MSERs were extracted from
 * \param contours     MSER contours, each a list of the region's pixels
 * \param mser_regions output: [0] = MSER+ regions, [1] = MSER- regions, each
 *                     headed by a fake root ERStat required by erGrouping
 * */
void MSERsToERStats(InputArray image, vector<vector<Point> > &contours, vector<vector<ERStat> > &mser_regions)
{
    CV_Assert(!contours.empty());
    Mat grey = image.getMat();
    // assert correct image type
    CV_Assert( grey.type() == CV_8UC1 );

    if (!mser_regions.empty())
        mser_regions.clear();

    //MSER output contains both MSER+ and MSER- regions in a single vector but we want them separated
    mser_regions.resize(2);
    // BUGFIX: reserve full capacity up front. Every region stores a pointer to
    // its channel's root (&mser_regions[c][0]); without the reserve, a later
    // push_back could reallocate the vector and dangle all parent pointers.
    mser_regions[0].reserve(contours.size()+1);
    mser_regions[1].reserve(contours.size()+1);

    //Append "fake" root region to simulate a tree structure (needed for grouping)
    ERStat fake_root;
    mser_regions[0].push_back(fake_root);
    mser_regions[1].push_back(fake_root);

    // Scratch images reused across contours; reset per-rect after each one.
    Mat mask = Mat::zeros(grey.rows, grey.cols, CV_8UC1);
    Mat mtmp = Mat::zeros(grey.rows, grey.cols, CV_8UC1);
    // Loop-invariant: 5x5 kernel used to dilate the region mask.
    Mat element = getStructuringElement( MORPH_RECT, Size(5,5), Point(2,2) );

    for (int i=0; i<(int)contours.size(); i++)
    {
        ERStat cser;
        cser.area = (int)contours[i].size();
        cser.rect = boundingRect(contours[i]);

        // Paint the region into the mask and compute its mean grey level.
        float avg_intensity = 0;
        const vector<Point>& r = contours[i];
        for ( int j = 0; j < (int)r.size(); j++ )
        {
            Point pt = r[j];
            mask.at<unsigned char>(pt) = 255;
            avg_intensity += (float)grey.at<unsigned char>(pt)/(int)r.size();
        }

        double min, max;
        Point min_loc, max_loc;
        minMaxLoc(grey(cser.rect), &min, &max, &min_loc, &max_loc, mask(cser.rect));

        // Mean intensity of the region's immediate border: dilate the mask and
        // keep only the ring (dilated minus original).
        dilate( mask(cser.rect), mtmp(cser.rect), element );
        absdiff( mtmp(cser.rect), mask(cser.rect), mtmp(cser.rect) );
        Scalar mean,std;
        meanStdDev(grey(cser.rect), mean, std, mtmp(cser.rect) );

        if (avg_intensity < mean[0])
        {
            // Region darker than its border: treat as extracted from the grey channel
            cser.level = (int)max;
            cser.pixel = (max_loc.y+cser.rect.y)*grey.cols+max_loc.x+cser.rect.x;
            cser.parent = &(mser_regions[0][0]);
            mser_regions[0].push_back(cser);
        }
        else
        {
            // Region brighter than its border: treat as from the inverted channel
            cser.level = 255-(int)min;
            cser.pixel = (min_loc.y+cser.rect.y)*grey.cols+min_loc.x+cser.rect.x;
            cser.parent = &(mser_regions[1][0]);
            mser_regions[1].push_back(cser);
        }

        // Clean the scratch images for the next contour.
        mask(cser.rect) = 0;
        mtmp(cser.rect) = 0;
    }
}
}
}

Loading…
Cancel
Save