From 007b1d9cb07d7f0d086c5fc2f1558608ef57033b Mon Sep 17 00:00:00 2001 From: lluis Date: Fri, 1 Aug 2014 18:26:20 +0200 Subject: [PATCH] adds MSERsToERStats function and its documentation; adds webcam_demo sample code and modifies cmakelists to link with features2d module --- modules/text/CMakeLists.txt | 2 +- modules/text/doc/erfilter.rst | 14 + .../text/include/opencv2/text/erfilter.hpp | 9 + modules/text/samples/webcam_demo.cpp | 333 ++++++++++++++++++ modules/text/src/erfilter.cpp | 73 ++++ 5 files changed, 430 insertions(+), 1 deletion(-) create mode 100644 modules/text/samples/webcam_demo.cpp diff --git a/modules/text/CMakeLists.txt b/modules/text/CMakeLists.txt index 4eea5c878..3324bac65 100644 --- a/modules/text/CMakeLists.txt +++ b/modules/text/CMakeLists.txt @@ -18,7 +18,7 @@ include_directories(${Tesseract_INCLUDE_DIR}) endif() set(the_description "Text Detection and Recognition") -ocv_define_module(text opencv_ml opencv_highgui opencv_imgproc opencv_core) +ocv_define_module(text opencv_ml opencv_highgui opencv_imgproc opencv_core opencv_features2d) if(${Tesseract_FOUND}) target_link_libraries(opencv_text ${Tesseract_LIBS}) diff --git a/modules/text/doc/erfilter.rst b/modules/text/doc/erfilter.rst index 685249c99..8539f5e3b 100644 --- a/modules/text/doc/erfilter.rst +++ b/modules/text/doc/erfilter.rst @@ -81,6 +81,20 @@ An ER is a 4-connected set of pixels with all its grey-level values smaller than ERStat* prev; }; +MSERsToERStats +-------------- +Converts MSER contours (vector<Point>) to ERStat regions. + +.. ocv:function:: void MSERsToERStats(InputArray image, vector< vector<Point> > &contours, vector< vector<ERStat> > &regions) + + :param image: Source image ``CV_8UC1`` from which the MSERs were extracted. + :param contours: Input vector with all the contours (vector<Point>). + :param regions: Output where the ERStat regions are stored. + +It takes as input the contours provided by the OpenCV MSER feature detector and returns as output two vectors of ERStats. 
This is because MSER() output contains both MSER+ and MSER- regions in a single vector; the function separates them into two different vectors (this is as if the ERStats were extracted from two different channels). + +An example of MSERsToERStats in use can be found in the text detection webcam_demo: https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/webcam_demo.cpp + computeNMChannels ----------------- Compute the different channels to be processed independently in the N&M algorithm [Neumann12]. diff --git a/modules/text/include/opencv2/text/erfilter.hpp b/modules/text/include/opencv2/text/erfilter.hpp index 8acb5c523..d03ec8361 100644 --- a/modules/text/include/opencv2/text/erfilter.hpp +++ b/modules/text/include/opencv2/text/erfilter.hpp @@ -274,6 +274,15 @@ CV_EXPORTS void erGrouping(InputArray img, InputArrayOfArrays channels, const std::string& filename = std::string(), float minProbablity = 0.5); +/*! + * MSERsToERStats function converts MSER contours (vector<Point>) to ERStat regions. + * It takes as input the contours provided by the OpenCV MSER feature detector and returns as output two vectors + * of ERStats. MSER output contains both MSER+ and MSER- regions in a single vector, the function separates + * them in two different vectors (this is as if the ERStats were extracted from two different channels). + * */ +CV_EXPORTS void MSERsToERStats(InputArray image, std::vector<std::vector<Point> > &contours, + std::vector<std::vector<ERStat> > &regions); + } } #endif // _OPENCV_TEXT_ERFILTER_HPP_ diff --git a/modules/text/samples/webcam_demo.cpp b/modules/text/samples/webcam_demo.cpp new file mode 100644 index 000000000..79ec7d117 --- /dev/null +++ b/modules/text/samples/webcam_demo.cpp @@ -0,0 +1,333 @@ +/* + * webcam-demo.cpp + * + * A demo program of End-to-end Scene Text Detection and Recognition. 
+ *
+ * Created on: Jul 31, 2014
+ *     Author: Lluis Gomez i Bigorda
+ */
+
+#include "opencv2/text.hpp"
+#include "opencv2/core/utility.hpp"
+#include "opencv2/highgui.hpp"
+#include "opencv2/imgproc.hpp"
+#include "opencv2/features2d.hpp"
+
+#include <algorithm>
+#include <cstdlib>
+#include <iostream>
+#include <string>
+#include <vector>
+
+using namespace std;
+using namespace cv;
+using namespace cv::text;
+
+// Loop body for cv::parallel_for_: ERStat extraction is done in parallel for different channels.
+// For every channel index c in the range it runs the first- and second-stage ER filters on
+// channels[c], writing into regions[c]. It holds one filter pair per channel because, as noted
+// where the filters are created in main(), the ER algorithm is not reentrant.
+class Parallel_extractCSER: public cv::ParallelLoopBody
+{
+private:
+    vector<Mat> &channels;
+    vector< vector<ERStat> > &regions;
+    vector< Ptr<ERFilter> > er_filter1;
+    vector< Ptr<ERFilter> > er_filter2;
+
+public:
+    // channels/regions are kept by reference (the caller owns them); the filter vectors are copied.
+    Parallel_extractCSER(vector<Mat> &_channels, vector< vector<ERStat> > &_regions,
+                         const vector< Ptr<ERFilter> > &_er_filter1, const vector< Ptr<ERFilter> > &_er_filter2)
+        : channels(_channels), regions(_regions), er_filter1(_er_filter1), er_filter2(_er_filter2) {}
+
+    // Processes the channel indices [r.start, r.end).
+    virtual void operator()( const cv::Range &r ) const
+    {
+        for (int c = r.start; c < r.end; c++)
+        {
+            er_filter1[c]->run(channels[c], regions[c]);
+            er_filter2[c]->run(channels[c], regions[c]);
+        }
+    }
+};
+
+
+//Discard wrongly recognised strings
+bool isRepetitive(const string& s);
+//Draw ER's in an image via floodFill
+void er_draw(vector<Mat> &channels, vector<vector<ERStat> > &regions, const vector<Vec2i> &group, Mat& segmentation);
+
+// Perform text detection and recognition from webcam.
+// Usage: <program> [camera_index]   (camera_index defaults to 0)
+// Returns 0 when the user quits with ESC, -1 when the camera cannot be opened or stops
+// delivering frames.
+int main(int argc, char* argv[])
+{
+    cout << endl << argv[0] << endl << endl;
+    cout << "A demo program of End-to-end Scene Text Detection and Recognition using webcam." << endl << endl;
+    cout << " Usage: " << argv[0] << " [camera_index]" << endl << endl;
+    // The key handler at the bottom of the loop reacts to 'r' (114), so advertise 'r' here
+    cout << " Press 'r' to switch between MSER/CSER regions." << endl;
+    cout << " Press 'g' to switch between Horizontal and Arbitrary oriented grouping." << endl;
+    cout << " Press 's' to scale down frame size to 320x240." << endl;
+    cout << " Press 'ESC' to exit." << endl << endl;
+
+    namedWindow("recognition",WINDOW_NORMAL);
+    bool downsize = false;
+    int REGION_TYPE = 1;          // index into region_types_str
+    int GROUPING_ALGORITHM = 0;   // index into grouping_algorithms_str
+    int RECOGNITION = 0;          // index into recognitions_str (only 0 is wired up below)
+    const char *region_types_str[2] = {"ERStats", "MSER"};
+    const char *grouping_algorithms_str[2] = {"exhaustive_search", "multioriented"};
+    const char *recognitions_str[3] = {"Tesseract", "NM_chain_features + KNN", "NM_chain_features + MLP"};
+
+    Mat frame,grey,orig_grey,out_img;
+    vector<Mat> channels;
+    vector<vector<ERStat> > regions(2); //two channels
+
+    // Create ERFilter objects with the 1st and 2nd stage default classifiers
+    // since er algorithm is not reentrant we need one filter for channel
+    vector< Ptr<ERFilter> > er_filters1;
+    vector< Ptr<ERFilter> > er_filters2;
+    for (int i=0; i<2; i++)
+    {
+        Ptr<ERFilter> er_filter1 = createERFilterNM1(loadClassifierNM1("trained_classifierNM1.xml"),8,0.00015,0.13,0.2,true,0.1);
+        Ptr<ERFilter> er_filter2 = createERFilterNM2(loadClassifierNM2("trained_classifierNM2.xml"),0.5);
+        er_filters1.push_back(er_filter1);
+        er_filters2.push_back(er_filter2);
+    }
+
+    //Initialize OCR engine (owned by a Ptr so that it is released on every return path)
+    Ptr<OCRTesseract> ocr_tess(new OCRTesseract());
+
+    int cam_idx = 0;
+    if (argc > 1)
+        cam_idx = atoi(argv[1]);
+
+    VideoCapture cap(cam_idx);
+    if(!cap.isOpened())
+    {
+        cout << "ERROR: Cannot open camera (" << cam_idx << ")." << endl;
+        return -1;
+    }
+
+    while (1)
+    {
+        double t_all = (double)getTickCount();
+
+        // An empty frame would make resize()/cvtColor() below fail, so stop instead
+        if (!cap.read(frame) || frame.empty())
+        {
+            cout << "ERROR: Cannot grab a frame from camera (" << cam_idx << ")." << endl;
+            return -1;
+        }
+
+        if (downsize)
+            resize(frame,frame,Size(320,240));
+
+        /*Text Detection*/
+
+        // VideoCapture delivers BGR frames
+        cvtColor(frame,grey,COLOR_BGR2GRAY);
+        grey.copyTo(orig_grey);
+        // Extract channels to be processed individually
+        channels.clear();
+        channels.push_back(grey);
+        channels.push_back(255-grey);
+
+        regions[0].clear();
+        regions[1].clear();
+
+        switch (REGION_TYPE)
+        {
+        case 0:
+        {
+            parallel_for_(cv::Range(0,(int)channels.size()), Parallel_extractCSER(channels,regions,er_filters1,er_filters2));
+            break;
+        }
+        case 1:
+        {
+            //Extract MSER
+            vector<vector<Point> > contours;
+            MSER(21,(int)(0.00002*grey.cols*grey.rows),(int)(0.05*grey.cols*grey.rows),1,0.7)(grey, contours);
+
+            //Convert the output of MSER to suitable input for the grouping/recognition algorithms
+            if (contours.size() > 0)
+                MSERsToERStats(grey, contours, regions);
+
+            break;
+        }
+        }
+
+        // Detect character groups
+        vector< vector<Vec2i> > nm_region_groups;
+        vector<Rect> nm_boxes;
+        switch (GROUPING_ALGORITHM)
+        {
+        case 0:
+        {
+            erGrouping(frame, channels, regions, nm_region_groups, nm_boxes, ERGROUPING_ORIENTATION_HORIZ);
+            break;
+        }
+        case 1:
+        {
+            erGrouping(frame, channels, regions, nm_region_groups, nm_boxes, ERGROUPING_ORIENTATION_ANY, "./trained_classifier_erGrouping.xml", 0.5);
+            break;
+        }
+        }
+
+        /*Text Recognition (OCR)*/
+
+        frame.copyTo(out_img);
+        float scale_img = 600.f/frame.rows;
+        // (2-scale_img) is negative for frames with fewer than 300 rows (e.g. the 320x240 mode),
+        // so keep the font scale and the derived line thickness positive
+        float scale_font = std::max(0.3f, (2-scale_img)/1.4f);
+        int text_thickness = std::max(1, (int)(3*scale_font));
+        vector<string> words_detection;
+        string output;
+
+        for (int i=0; i<(int)nm_boxes.size(); i++)
+        {
+            rectangle(out_img, nm_boxes[i].tl(), nm_boxes[i].br(), Scalar(255,255,0),3);
+
+            // Render the group's regions into a mask, crop it to the group box and pad it for OCR
+            Mat group_img = Mat::zeros(frame.rows+2, frame.cols+2, CV_8UC1);
+            er_draw(channels, regions, nm_region_groups[i], group_img);
+            group_img(nm_boxes[i]).copyTo(group_img);
+            copyMakeBorder(group_img,group_img,15,15,15,15,BORDER_CONSTANT,Scalar(0));
+
+            vector<Rect> boxes;
+            vector<string> words;
+            vector<float> confidences;
+
+            float min_confidence1 = 0.f;
+            float min_confidence2 = 0.f;
+
+            if (RECOGNITION == 0)
+            {
+                ocr_tess->run(group_img, output, &boxes, &words, &confidences, OCR_LEVEL_WORD);
+                min_confidence1 = 51.f;
+                min_confidence2 = 60.f;
+            }
+
+            output.erase(remove(output.begin(), output.end(), '\n'), output.end());
+            if (output.size() < 3)
+                continue;
+
+            for (int j=0; j<(int)boxes.size(); j++)
+            {
+                // Word boxes are relative to the cropped and padded image: undo crop and 15px border
+                boxes[j].x += nm_boxes[i].x-15;
+                boxes[j].y += nm_boxes[i].y-15;
+
+                if ((words[j].size() < 2) || (confidences[j] < min_confidence1) ||
+                    ((words[j].size()==2) && (words[j][0] == words[j][1])) ||
+                    ((words[j].size()< 4) && (confidences[j] < min_confidence2)) ||
+                    isRepetitive(words[j]))
+                    continue;
+                words_detection.push_back(words[j]);
+                rectangle(out_img, boxes[j].tl(), boxes[j].br(), Scalar(255,0,255),3);
+                Size word_size = getTextSize(words[j], FONT_HERSHEY_SIMPLEX, scale_font, text_thickness, NULL);
+                rectangle(out_img, boxes[j].tl()-Point(3,word_size.height+3), boxes[j].tl()+Point(word_size.width,0), Scalar(255,0,255),-1);
+                putText(out_img, words[j], boxes[j].tl()-Point(1,1), FONT_HERSHEY_SIMPLEX, scale_font, Scalar(255,255,255),text_thickness);
+            }
+
+        }
+
+        t_all = ((double)getTickCount() - t_all)*1000/getTickFrequency();
+        // Report the size of the frame that was actually processed
+        const String fps_info = cv::format("%2.1f Fps. @ %dx%d", (float)(1000/t_all), frame.cols, frame.rows);
+        rectangle(out_img, Point(out_img.rows-160,out_img.rows-70), Point(out_img.cols,out_img.rows), Scalar(255,255,255),-1);
+        putText(out_img, fps_info, Point(10,out_img.rows-10), FONT_HERSHEY_DUPLEX, scale_font, Scalar(255,0,0));
+        putText(out_img, region_types_str[REGION_TYPE], Point(out_img.rows-150,out_img.rows-50), FONT_HERSHEY_DUPLEX, scale_font, Scalar(255,0,0));
+        putText(out_img, grouping_algorithms_str[GROUPING_ALGORITHM], Point(out_img.rows-150,out_img.rows-30), FONT_HERSHEY_DUPLEX, scale_font, Scalar(255,0,0));
+        putText(out_img, recognitions_str[RECOGNITION], Point(out_img.rows-150,out_img.rows-10), FONT_HERSHEY_DUPLEX, scale_font, Scalar(255,0,0));
+
+        imshow("recognition", out_img);
+        int key = waitKey(30);
+        if (key == 27) //ESC
+        {
+            cout << "esc key pressed" << endl;
+            break;
+        }
+
+        switch (key)
+        {
+        case 'g':
+            GROUPING_ALGORITHM = (GROUPING_ALGORITHM+1)%2;
+            cout << "Grouping switched to " << GROUPING_ALGORITHM << endl;
+            break;
+        //case 'o':
+        //    RECOGNITION = (RECOGNITION+1)%3;
+        //    cout << "OCR switched to " << RECOGNITION << endl;
+        //    break;
+        case 'r':
+            REGION_TYPE = (REGION_TYPE+1)%2;
+            cout << "Regions switched to " << REGION_TYPE << endl;
+            break;
+        case 's':
+            downsize = !downsize;
+            break;
+        default:
+            break;
+        }
+
+    }
+
+    return 0;
+}
+
+// Heuristic used to discard wrongly recognised strings. Returns true when
+//  - more than (size+1)/2 characters are one of 'i', 'l', 'I', or
+//  - every character equals the first one, or
+//  - more than 2/3 of the characters equal the last one.
+// An empty string is reported as not repetitive.
+bool isRepetitive(const string& s)
+{
+    // s[size-1] below needs at least one character
+    if (s.empty())
+        return false;
+
+    const int len = (int)s.size();
+    const char first = s[0];
+    const char last = s[len-1];
+    int thin_count = 0;     // 'i', 'l' or 'I'
+    int first_count = 0;    // equal to the first character
+    int last_count = 0;     // equal to the last character
+    for (int i=0; i<len; i++)
+    {
+        if ((s[i] == 'i') || (s[i] == 'l') || (s[i] == 'I'))
+            thin_count++;
+        if (s[i] == first)
+            first_count++;
+        if (s[i] == last)
+            last_count++;
+    }
+
+    return (thin_count > (len+1)/2) || (first_count == len) || (last_count > (len*2)/3);
+}
+
+
+// Draw ER's in an image via floodFill.
+// Each entry of group is a (channel index, region index) pair into channels/regions. For every
+// referenced region that has a parent, floodFill is run in mask-only mode on its channel,
+// seeded at the region's pixel with an upper difference of er.level; the fill is written into
+// segmentation as value 255. The callers in this file pass a zeroed CV_8UC1 mask that is
+// 2 pixels larger than the frame in each dimension.
+void er_draw(vector<Mat> &channels, vector<vector<ERStat> > &regions, const vector<Vec2i> &group, Mat& segmentation)
+{
+    for (int r=0; r<(int)group.size(); r++)
+    {
+        const ERStat &er = regions[group[r][0]][group[r][1]];
+        if (er.parent != NULL) // skip the root region
+        {
+            Mat &channel = channels[group[r][0]];
+            int newMaskVal = 255;
+            int flags = 4 + (newMaskVal << 8) + FLOODFILL_FIXED_RANGE + FLOODFILL_MASK_ONLY;
+            floodFill(channel,segmentation,Point(er.pixel%channel.cols,er.pixel/channel.cols),
+                      Scalar(255),0,Scalar(er.level),Scalar(0),flags);
+        }
+    }
+}
diff --git a/modules/text/src/erfilter.cpp b/modules/text/src/erfilter.cpp
index 19b0e752c..2c9695b1e 100644
--- a/modules/text/src/erfilter.cpp
+++ b/modules/text/src/erfilter.cpp
@@ -4069,5 +4069,78 @@ void erGrouping(InputArray image, InputArrayOfArrays channels, vector<vector<ERStat> > &regions,
+/*!
+ * MSERsToERStats function converts MSER contours (vector<Point>) to ERStat regions.
+ * It takes as input the contours provided by the OpenCV MSER feature detector and returns as output two vectors
+ * of ERStats. MSER output contains both MSER+ and MSER- regions in a single vector, the function separates
+ * them in two different vectors (this is as if the ERStats were extracted from two different channels).
+ *
+ * image        Source image, must be CV_8UC1 (asserted).
+ * contours     MSER regions as point lists, must not be empty (asserted). The number of points of
+ *              a contour is used as the region area; contours without points are skipped.
+ * mser_regions Output. Any previous content is discarded and it is resized to two vectors, each
+ *              starting with a default-constructed "fake" root ERStat. A region whose average
+ *              intensity is lower than the mean of the ring around it goes to mser_regions[0]
+ *              (level = maximum intensity inside the region); every other region goes to
+ *              mser_regions[1] (level = 255 - minimum intensity inside the region). The parent of
+ *              each region points to the fake root of its own vector.
+ * */
+void MSERsToERStats(InputArray image, vector<vector<Point> > &contours, vector<vector<ERStat> > &mser_regions)
+{
+
+    CV_Assert(!contours.empty());
+
+    Mat grey = image.getMat();
+    // assert correct image type
+    CV_Assert( grey.type() == CV_8UC1 );
+
+    //MSER output contains both MSER+ and MSER- regions in a single vector but we want them separated
+    mser_regions.clear();
+    mser_regions.resize(2);
+
+    // Every region stores a pointer to element 0 of its own vector. Reserve room for the worst
+    // case (all contours in one vector, plus the root) so that push_back never reallocates and
+    // those pointers stay valid while the vectors are being filled.
+    mser_regions[0].reserve(contours.size() + 1);
+    mser_regions[1].reserve(contours.size() + 1);
+
+    //Append "fake" root region to simulate a tree structure (needed for grouping)
+    ERStat fake_root;
+    mser_regions[0].push_back(fake_root);
+    mser_regions[1].push_back(fake_root);
+
+    Mat mask = Mat::zeros(grey.rows, grey.cols, CV_8UC1);
+    Mat mtmp = Mat::zeros(grey.rows, grey.cols, CV_8UC1);
+
+    // 5x5 kernel used to grow each region mask; it does not depend on the region
+    const Mat element = getStructuringElement( MORPH_RECT, Size(5,5), Point(2,2) );
+
+    for (int i=0; i<(int)contours.size(); i++)
+    {
+        const vector<Point>& r = contours[i];
+        if (r.empty())
+            continue;
+
+        ERStat cser;
+        cser.area = (int)r.size();
+        cser.rect = boundingRect(r);
+
+        // Paint the region into the mask and accumulate its intensity
+        float intensity_sum = 0;
+        for ( int j = 0; j < (int)r.size(); j++ )
+        {
+            const Point pt = r[j];
+            mask.at<unsigned char>(pt) = 255;
+            intensity_sum += (float)grey.at<unsigned char>(pt);
+        }
+        const float avg_intensity = intensity_sum/(float)r.size();
+
+        // Extreme values inside the region, located relative to cser.rect
+        double min, max;
+        Point min_loc, max_loc;
+        minMaxLoc(grey(cser.rect), &min, &max, &min_loc, &max_loc, mask(cser.rect));
+
+        // mtmp = dilated mask minus mask: the ring around the region (within its bounding box)
+        dilate( mask(cser.rect), mtmp(cser.rect), element );
+        absdiff( mtmp(cser.rect), mask(cser.rect), mtmp(cser.rect) );
+
+        Scalar mean,std;
+        meanStdDev(grey(cser.rect), mean, std, mtmp(cser.rect) );
+
+        if (avg_intensity < mean[0])
+        {
+            // region is darker than its surroundings
+            cser.level = (int)max;
+            cser.pixel = (max_loc.y+cser.rect.y)*grey.cols+max_loc.x+cser.rect.x;
+            cser.parent = &(mser_regions[0][0]);
+            mser_regions[0].push_back(cser);
+        }
+        else
+        {
+            // region is not darker than its surroundings: express the level for the inverted image
+            cser.level = 255-(int)min;
+            cser.pixel = (min_loc.y+cser.rect.y)*grey.cols+min_loc.x+cser.rect.x;
+            cser.parent = &(mser_regions[1][0]);
+            mser_regions[1].push_back(cser);
+        }
+
+        // Reset the scratch masks for the next region
+        mask(cser.rect) = Scalar(0);
+        mtmp(cser.rect) = Scalar(0);
+    }
+}
+
 }
 }