You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
791 lines
28 KiB
791 lines
28 KiB
/*M/////////////////////////////////////////////////////////////////////////////////////// |
|
// |
|
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
|
// |
|
// By downloading, copying, installing or using the software you agree to this license. |
|
// If you do not agree to this license, do not download, install, |
|
// copy or use the software. |
|
// |
|
// |
|
// License Agreement |
|
// For Open Source Computer Vision Library |
|
// |
|
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
|
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
|
// Third party copyrights are property of their respective owners. |
|
// |
|
// Redistribution and use in source and binary forms, with or without modification, |
|
// are permitted provided that the following conditions are met: |
|
// |
|
// * Redistribution's of source code must retain the above copyright notice, |
|
// this list of conditions and the following disclaimer. |
|
// |
|
// * Redistribution's in binary form must reproduce the above copyright notice, |
|
// this list of conditions and the following disclaimer in the documentation |
|
// and/or other materials provided with the distribution. |
|
// |
|
// * The name of the copyright holders may not be used to endorse or promote products |
|
// derived from this software without specific prior written permission. |
|
// |
|
// This software is provided by the copyright holders and contributors "as is" and |
|
// any express or implied warranties, including, but not limited to, the implied |
|
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
|
// In no event shall the Intel Corporation or contributors be liable for any direct, |
|
// indirect, incidental, special, exemplary, or consequential damages |
|
// (including, but not limited to, procurement of substitute goods or services; |
|
// loss of use, data, or profits; or business interruption) however caused |
|
// and on any theory of liability, whether in contract, strict liability, |
|
// or tort (including negligence or otherwise) arising in any way out of |
|
// the use of this software, even if advised of the possibility of such damage. |
|
// |
|
//M*/ |
|
|
|
#include "precomp.hpp" |
|
#include "opencv2/imgproc.hpp" |
|
#include "opencv2/ml.hpp" |
|
|
|
#include <iostream> |
|
#include <fstream> |
|
#include <set> |
|
|
|
namespace cv |
|
{ |
|
namespace text |
|
{ |
|
|
|
using namespace std; |
|
using namespace cv::ml; |
|
|
|
/* OCR BeamSearch Decoder */ |
|
|
|
/**
 * Base-class version of the low level recognition call.
 *
 * It performs no recognition: it checks the arguments and resets every output
 * container. The decoding itself is implemented by
 * OCRBeamSearchDecoderImpl::run further down in this file.
 *
 * @param image                  input image; must be CV_8UC1 or CV_8UC3.
 * @param output_text            emptied on return.
 * @param component_rects        optional output, emptied when not NULL.
 * @param component_texts        optional output, emptied when not NULL.
 * @param component_confidences  optional output, emptied when not NULL.
 * @param component_level        OCR_LEVEL_TEXTLINE or OCR_LEVEL_WORD.
 */
void OCRBeamSearchDecoder::run(Mat& image, string& output_text, vector<Rect>* component_rects,
                               vector<string>* component_texts, vector<float>* component_confidences,
                               int component_level)
{
    CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
    CV_Assert( (component_level == OCR_LEVEL_TEXTLINE) || (component_level == OCR_LEVEL_WORD) );

    // Optional outputs are only touched when the caller actually supplied them.
    if (component_confidences)
        component_confidences->clear();
    if (component_texts)
        component_texts->clear();
    if (component_rects)
        component_rects->clear();

    output_text.clear();
}
|
/**
 * Base-class version of the low level recognition call that also takes a mask.
 *
 * Like the mask-less overload it performs no recognition: it validates the
 * arguments and resets every output container.
 *
 * @param image                  input image; must be CV_8UC1 or CV_8UC3.
 * @param mask                   must be CV_8UC1; only its type is checked here.
 * @param output_text            emptied on return.
 * @param component_rects        optional output, emptied when not NULL.
 * @param component_texts        optional output, emptied when not NULL.
 * @param component_confidences  optional output, emptied when not NULL.
 * @param component_level        OCR_LEVEL_TEXTLINE or OCR_LEVEL_WORD.
 */
void OCRBeamSearchDecoder::run(Mat& image, Mat& mask, string& output_text, vector<Rect>* component_rects,
                               vector<string>* component_texts, vector<float>* component_confidences,
                               int component_level)
{
    CV_Assert(mask.type() == CV_8UC1);
    CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
    CV_Assert( (component_level == OCR_LEVEL_TEXTLINE) || (component_level == OCR_LEVEL_WORD) );

    // Optional outputs are only touched when the caller actually supplied them.
    if (component_confidences)
        component_confidences->clear();
    if (component_texts)
        component_texts->clear();
    if (component_rects)
        component_rects->clear();

    output_text.clear();
}
|
|
|
/**
 * Convenience overload: runs the low level run() and returns the concatenation
 * of every component text whose confidence is strictly greater than
 * min_confidence.
 *
 * @param image            input image, forwarded as a Mat.
 * @param min_confidence   components with confidence <= this value are dropped.
 * @param component_level  forwarded unchanged to the low level run().
 * @return the accepted component texts, joined in order with no separator.
 */
CV_WRAP String OCRBeamSearchDecoder::run(InputArray image, int min_confidence, int component_level)
{
    Mat image_m = image.getMat();

    std::string full_text;            // filled by the low level call, not returned
    vector<string> texts;
    vector<float> confidences;
    run(image_m, full_text, NULL, &texts, &confidences, component_level);

    std::string accepted;
    for (size_t idx = 0; idx != texts.size(); ++idx)
    {
        // keep only components that clear the confidence threshold
        if (!(confidences[idx] > min_confidence))
            continue;
        accepted += texts[idx];
    }

    return String(accepted);
}
|
|
|
/**
 * Convenience overload with a mask: runs the low level masked run() and returns
 * the concatenation of every component text whose confidence is strictly
 * greater than min_confidence.
 *
 * @param image            input image, forwarded as a Mat.
 * @param mask             mask, forwarded as a Mat.
 * @param min_confidence   components with confidence <= this value are dropped.
 * @param component_level  forwarded unchanged to the low level run().
 * @return the accepted component texts, joined in order with no separator.
 */
CV_WRAP String OCRBeamSearchDecoder::run(InputArray image, InputArray mask, int min_confidence, int component_level)
{
    Mat image_m = image.getMat();
    Mat mask_m = mask.getMat();

    std::string full_text;            // filled by the low level call, not returned
    vector<string> texts;
    vector<float> confidences;
    run(image_m, mask_m, full_text, NULL, &texts, &confidences, component_level);

    std::string accepted;
    for (size_t idx = 0; idx != texts.size(); ++idx)
    {
        // keep only components that clear the confidence threshold
        if (!(confidences[idx] > min_confidence))
            continue;
        accepted += texts[idx];
    }

    return String(accepted);
}
|
|
|
|
|
/**
 * Base-class classifier callback: validates the image type and leaves both
 * output containers empty. It produces no probabilities itself.
 *
 * @param image                      must be CV_8UC3 or CV_8UC1.
 * @param recognition_probabilities  emptied on return.
 * @param oversegmentation           emptied on return.
 */
void OCRBeamSearchDecoder::ClassifierCallback::eval( InputArray image, vector< vector<double> >& recognition_probabilities, vector<int>& oversegmentation)
{
    CV_Assert(( image.getMat().type() == CV_8UC3 ) || ( image.getMat().type() == CV_8UC1 ));

    // Clearing the outer vector destroys every inner vector as well.
    recognition_probabilities.clear();
    oversegmentation.clear();
}
|
|
|
// One candidate segmentation kept in the beam.
struct beamSearch_node {
    double score;                   // value returned by score_segmentation() for this candidate
    std::vector<int> segmentation;  // increasing indices into the oversegmentation points
    bool expanded;                  // true once the children of this node have been generated
    // TODO calculating score of its childs would be much faster if we store the last column
    //      of their "root" path.
};

// Ordering predicate for the beam: higher scores come first.
// The nodes are taken by const reference so that sorting does not deep-copy the
// segmentation vector of both operands on every comparison.
bool beam_sort_function ( const beamSearch_node& a, const beamSearch_node& b );
bool beam_sort_function ( const beamSearch_node& a, const beamSearch_node& b )
{
    return (a.score > b.score);
}
|
|
|
|
|
class OCRBeamSearchDecoderImpl : public OCRBeamSearchDecoder |
|
{ |
|
public: |
|
//Default constructor |
|
OCRBeamSearchDecoderImpl( Ptr<OCRBeamSearchDecoder::ClassifierCallback> _classifier, |
|
const string& _vocabulary, |
|
InputArray transition_probabilities_table, |
|
InputArray emission_probabilities_table, |
|
decoder_mode _mode, |
|
int _beam_size) |
|
{ |
|
classifier = _classifier; |
|
step_size = classifier->getStepSize(); |
|
win_size = classifier->getWindowSize(); |
|
emission_p = emission_probabilities_table.getMat(); |
|
vocabulary = _vocabulary; |
|
mode = _mode; |
|
beam_size = _beam_size; |
|
transition_probabilities_table.getMat().copyTo(transition_p); |
|
for (int i=0; i<transition_p.rows; i++) |
|
{ |
|
for (int j=0; j<transition_p.cols; j++) |
|
{ |
|
if (transition_p.at<double>(i,j) == 0) |
|
transition_p.at<double>(i,j) = -DBL_MAX; |
|
else |
|
transition_p.at<double>(i,j) = log(transition_p.at<double>(i,j)); |
|
} |
|
} |
|
} |
|
|
|
~OCRBeamSearchDecoderImpl() |
|
{ |
|
} |
|
|
|
void run( Mat& src, |
|
Mat& mask, |
|
string& out_sequence, |
|
vector<Rect>* component_rects, |
|
vector<string>* component_texts, |
|
vector<float>* component_confidences, |
|
int component_level) |
|
{ |
|
CV_Assert(mask.type() == CV_8UC1); |
|
//nothing to do with a mask here |
|
run( src, out_sequence, component_rects, component_texts, component_confidences, |
|
component_level); |
|
} |
|
|
|
void run( Mat& src, |
|
string& out_sequence, |
|
vector<Rect>* component_rects, |
|
vector<string>* component_texts, |
|
vector<float>* component_confidences, |
|
int component_level) |
|
{ |
|
|
|
CV_Assert( (src.type() == CV_8UC1) || (src.type() == CV_8UC3) ); |
|
CV_Assert( (src.cols > 0) && (src.rows > 0) ); |
|
CV_Assert( component_level == OCR_LEVEL_WORD ); |
|
out_sequence.clear(); |
|
if (component_rects != NULL) |
|
component_rects->clear(); |
|
if (component_texts != NULL) |
|
component_texts->clear(); |
|
if (component_confidences != NULL) |
|
component_confidences->clear(); |
|
|
|
if(src.type() == CV_8UC3) |
|
{ |
|
cvtColor(src,src,COLOR_RGB2GRAY); |
|
} |
|
|
|
|
|
// TODO if input is a text line (not a word) we may need to split into words here! |
|
|
|
// do sliding window classification along a croped word image |
|
classifier->eval(src, recognition_probabilities, oversegmentation); |
|
|
|
// if the number of oversegmentation points found is less than 2 we can not do nothing!! |
|
if (oversegmentation.size() < 2) return; |
|
|
|
|
|
//NMS of recognitions |
|
double last_best_p = 0; |
|
int last_best_idx = -1; |
|
for (size_t i=0; i<recognition_probabilities.size(); ) |
|
{ |
|
double best_p = 0; |
|
int best_idx = -1; |
|
for (size_t j=0; j<recognition_probabilities[i].size(); j++) |
|
{ |
|
if (recognition_probabilities[i][j] > best_p) |
|
{ |
|
best_p = recognition_probabilities[i][j]; |
|
best_idx = (int)j; |
|
} |
|
} |
|
|
|
if ((i>0) && (best_idx == last_best_idx) |
|
&& (oversegmentation[i]*step_size < oversegmentation[i-1]*step_size + win_size) ) |
|
{ |
|
if (last_best_p > best_p) |
|
{ |
|
//remove i'th elements and do not increment i |
|
recognition_probabilities.erase (recognition_probabilities.begin()+i); |
|
oversegmentation.erase (oversegmentation.begin()+i); |
|
continue; |
|
} else { |
|
//remove (i-1)'th elements and do not increment i |
|
recognition_probabilities.erase (recognition_probabilities.begin()+i-1); |
|
oversegmentation.erase (oversegmentation.begin()+i-1); |
|
last_best_idx = best_idx; |
|
last_best_p = best_p; |
|
continue; |
|
} |
|
} |
|
|
|
last_best_idx = best_idx; |
|
last_best_p = best_p; |
|
i++; |
|
} |
|
|
|
/*Now we go with the beam search algorithm to optimize the recognition score*/ |
|
|
|
//convert probabilities to log probabilities |
|
for (size_t i=0; i<recognition_probabilities.size(); i++) |
|
{ |
|
for (size_t j=0; j<recognition_probabilities[i].size(); j++) |
|
{ |
|
if (recognition_probabilities[i][j] == 0) |
|
recognition_probabilities[i][j] = -DBL_MAX; |
|
else |
|
recognition_probabilities[i][j] = log(recognition_probabilities[i][j]); |
|
} |
|
} |
|
|
|
// initialize the beam with all possible character's pairs |
|
int generated_chids = 0; |
|
for (size_t i=0; i<recognition_probabilities.size()-1; i++) |
|
{ |
|
for (size_t j=i+1; j<recognition_probabilities.size(); j++) |
|
{ |
|
|
|
beamSearch_node node; |
|
node.segmentation.push_back((int)i); |
|
node.segmentation.push_back((int)j); |
|
node.score = score_segmentation(node.segmentation, out_sequence); |
|
vector< vector<int> > childs = generate_childs( node.segmentation ); |
|
node.expanded = true; |
|
|
|
beam.push_back( node ); |
|
|
|
if (!childs.empty()) |
|
update_beam( childs ); |
|
|
|
generated_chids += (int)childs.size(); |
|
|
|
} |
|
} |
|
|
|
while (generated_chids != 0) |
|
{ |
|
generated_chids = 0; |
|
|
|
for (size_t i=0; i<beam.size(); i++) |
|
{ |
|
vector< vector<int> > childs; |
|
if (!beam[i].expanded) |
|
{ |
|
childs = generate_childs( beam[i].segmentation ); |
|
beam[i].expanded = true; |
|
} |
|
if (!childs.empty()) |
|
update_beam( childs ); |
|
generated_chids += (int)childs.size(); |
|
} |
|
} |
|
|
|
// Done! Get the best prediction found into out_sequence |
|
double lp = score_segmentation( beam[0].segmentation, out_sequence ); |
|
|
|
// fill other (dummy) output parameters |
|
component_rects->push_back(Rect(0,0,src.cols,src.rows)); |
|
component_texts->push_back(out_sequence); |
|
component_confidences->push_back((float)exp(lp)); |
|
|
|
return; |
|
} |
|
|
|
private: |
|
int win_size; |
|
int step_size; |
|
|
|
vector< beamSearch_node > beam; |
|
vector< vector<double> > recognition_probabilities; |
|
vector<int> oversegmentation; |
|
|
|
vector< vector<int> > generate_childs( vector<int> &segmentation ) |
|
{ |
|
|
|
vector< vector<int> > childs; |
|
for (size_t i=segmentation[segmentation.size()-1]+1; i<oversegmentation.size(); i++) |
|
{ |
|
int seg_point = (int)i; |
|
if (find(segmentation.begin(), segmentation.end(), seg_point) == segmentation.end()) |
|
{ |
|
vector<int> child = segmentation; |
|
child.push_back(seg_point); |
|
childs.push_back(child); |
|
} |
|
} |
|
return childs; |
|
} |
|
|
|
void update_beam ( vector< vector<int> > &childs ) |
|
{ |
|
string out_sequence; |
|
double min_score = -DBL_MAX; //min score value to be part of the beam |
|
if ((int)beam.size() >= beam_size) |
|
min_score = beam[beam_size-1].score; //last element has the lowest score |
|
|
|
for (size_t i=0; i<childs.size(); i++) |
|
{ |
|
double score = score_segmentation(childs[i], out_sequence); |
|
if (score > min_score) |
|
{ |
|
beamSearch_node node; |
|
node.score = score; |
|
node.segmentation = childs[i]; |
|
node.expanded = false; |
|
beam.push_back(node); |
|
sort(beam.begin(),beam.end(),beam_sort_function); |
|
if ((int)beam.size() > beam_size) |
|
{ |
|
beam.erase(beam.begin()+beam_size,beam.end()); |
|
min_score = beam[beam.size()-1].score; |
|
} |
|
} |
|
} |
|
} |
|
|
|
|
|
double score_segmentation( vector<int> &segmentation, string& outstring ) |
|
{ |
|
|
|
// Score Heuristics: |
|
// No need to use Viterbi to know a given segmentation is bad |
|
// e.g.: in some cases we discard a segmentation because it includes a very large character |
|
// in other cases we do it because the overlapping between two chars is too large |
|
// TODO Add more heuristics (e.g. penalize large inter-character variance) |
|
|
|
Mat interdist ((int)segmentation.size()-1, 1, CV_32F, 1); |
|
for (size_t i=0; i<segmentation.size()-1; i++) |
|
{ |
|
interdist.at<float>((int)i,0) = (float)oversegmentation[segmentation[(int)i+1]]*step_size |
|
- (float)oversegmentation[segmentation[(int)i]]*step_size; |
|
if ((float)interdist.at<float>((int)i,0)/win_size > 2.25) // TODO explain how did you set this thrs |
|
{ |
|
return -DBL_MAX; |
|
} |
|
if ((float)interdist.at<float>((int)i,0)/win_size < 0.15) // TODO explain how did you set this thrs |
|
{ |
|
return -DBL_MAX; |
|
} |
|
} |
|
Scalar m, std; |
|
meanStdDev(interdist, m, std); |
|
//double interdist_std = std[0]; |
|
|
|
//TODO Extracting start probs from lexicon (if we have it) may boost accuracy! |
|
vector<double> start_p(vocabulary.size()); |
|
for (int i=0; i<(int)vocabulary.size(); i++) |
|
start_p[i] = log(1.0/vocabulary.size()); |
|
|
|
|
|
Mat V = Mat::ones((int)segmentation.size(),(int)vocabulary.size(),CV_64FC1); |
|
V = V * -DBL_MAX; |
|
vector<string> path(vocabulary.size()); |
|
|
|
// Initialize base cases (t == 0) |
|
for (int i=0; i<(int)vocabulary.size(); i++) |
|
{ |
|
V.at<double>(0,i) = start_p[i] + recognition_probabilities[segmentation[0]][i]; |
|
path[i] = vocabulary.at(i); |
|
} |
|
|
|
|
|
// Run Viterbi for t > 0 |
|
for (int t=1; t<(int)segmentation.size(); t++) |
|
{ |
|
|
|
vector<string> newpath(vocabulary.size()); |
|
|
|
for (int i=0; i<(int)vocabulary.size(); i++) |
|
{ |
|
double max_prob = -DBL_MAX; |
|
int best_idx = 0; |
|
for (int j=0; j<(int)vocabulary.size(); j++) |
|
{ |
|
double prob = V.at<double>(t-1,j) + transition_p.at<double>(j,i) + recognition_probabilities[segmentation[t]][i]; |
|
if ( prob > max_prob) |
|
{ |
|
max_prob = prob; |
|
best_idx = j; |
|
} |
|
} |
|
|
|
V.at<double>(t,i) = max_prob; |
|
newpath[i] = path[best_idx] + vocabulary.at(i); |
|
} |
|
|
|
// Don't need to remember the old paths |
|
path.swap(newpath); |
|
} |
|
|
|
double max_prob = -DBL_MAX; |
|
int best_idx = 0; |
|
for (int i=0; i<(int)vocabulary.size(); i++) |
|
{ |
|
double prob = V.at<double>((int)segmentation.size()-1,i); |
|
if ( prob > max_prob) |
|
{ |
|
max_prob = prob; |
|
best_idx = i; |
|
} |
|
} |
|
|
|
outstring = path[best_idx]; |
|
return (max_prob / (segmentation.size()-1)); |
|
} |
|
|
|
}; |
|
|
|
/**
 * Factory: builds an OCRBeamSearchDecoderImpl from the given arguments and
 * returns it through the OCRBeamSearchDecoder interface. All arguments are
 * forwarded unchanged to the implementation's constructor.
 */
Ptr<OCRBeamSearchDecoder> OCRBeamSearchDecoder::create( Ptr<OCRBeamSearchDecoder::ClassifierCallback> _classifier,
                                                        const string& _vocabulary,
                                                        InputArray transition_p,
                                                        InputArray emission_p,
                                                        decoder_mode _mode,
                                                        int _beam_size)
{
    Ptr<OCRBeamSearchDecoder> decoder =
        makePtr<OCRBeamSearchDecoderImpl>(_classifier, _vocabulary, transition_p, emission_p, _mode, _beam_size);
    return decoder;
}
|
|
|
/**
 * Factory overload taking the decoder mode as a plain int, which is cast to
 * decoder_mode. Every other argument is forwarded unchanged to the
 * OCRBeamSearchDecoderImpl constructor.
 */
CV_EXPORTS_W Ptr<OCRBeamSearchDecoder> OCRBeamSearchDecoder::create(Ptr<OCRBeamSearchDecoder::ClassifierCallback> _classifier,
                                                                    const String& _vocabulary,
                                                                    InputArray transition_p,
                                                                    InputArray emission_p,
                                                                    int _mode,
                                                                    int _beam_size)
{
    Ptr<OCRBeamSearchDecoder> decoder =
        makePtr<OCRBeamSearchDecoderImpl>(_classifier, _vocabulary, transition_p, emission_p, (decoder_mode)_mode, _beam_size);
    return decoder;
}
|
|
|
|
|
/**
 * Sliding window character classifier used as the ClassifierCallback of the
 * beam search decoder. Its parameters (kernels, whitening matrices, logistic
 * regression weights, feature ranges) are loaded from a file by the constructor.
 */
class CV_EXPORTS OCRBeamSearchClassifierCNN : public OCRBeamSearchDecoder::ClassifierCallback
{
public:
    // Loads the classifier parameters from the given file.
    OCRBeamSearchClassifierCNN(const std::string& filename);

    ~OCRBeamSearchClassifierCNN()
    {
    }

    // Classifies every sliding window of src; one probability vector and one
    // oversegmentation point are produced per window.
    void eval( InputArray src, vector< vector<double> >& recognition_probabilities, vector<int>& oversegmentation );

    int getWindowSize()
    {
        return window_size;
    }

    int getStepSize()
    {
        return step_size;
    }

    void setStepSize(int _step_size)
    {
        step_size = _step_size;
    }

protected:
    // Contrast normalization followed by ZCA whitening, applied row by row.
    void normalizeAndZCA(Mat& patches);

    // Fills prob_estimates with the normalized class scores of one feature
    // vector and returns the index of the winning class.
    double eval_feature(Mat& feature, double* prob_estimates);

private:
    int window_size;   // side of the square detection window
    int step_size;     // horizontal step of the sliding window
    int nr_class;      // number of classes (columns of weights)
    int nr_feature;    // number of features (rows of weights)
    Mat feature_min;   // lower bound of the feature scaling range
    Mat feature_max;   // upper bound of the feature scaling range
    Mat weights;       // Logistic Regression weights
    Mat kernels;       // CNN kernels
    Mat M, P;          // ZCA Whitening parameters
    int quad_size;     // side of a quad
    int patch_size;    // side of a patch
    int num_quads;     // extract 25 quads (12x12) from each image
    int num_tiles;     // extract 25 patches (8x8) from each quad
    double alpha;      // used in non-linear activation function z = max(0, |D*a| - alpha)
};
|
|
|
/**
 * Loads the classifier parameters from filename and derives the window geometry.
 *
 * Raises Error::StsBadArg when the file cannot be opened. The entries read are
 * "kernels", "M", "P", "weights", "feature_min" and "feature_max".
 */
OCRBeamSearchClassifierCNN::OCRBeamSearchClassifierCNN (const string& filename)
{
    // A temporary stream is used only to probe that the file can be opened.
    const bool unreadable = ifstream(filename.c_str()).fail();
    if (unreadable)
    {
        CV_Error(Error::StsBadArg, "Default classifier data file not found!");
    }
    else
    {
        FileStorage fs(filename, FileStorage::READ);
        // kernels bank and whitening parameters
        fs["kernels"] >> kernels;
        fs["M"] >> M;
        fs["P"] >> P;
        // Logistic Regression weights
        fs["weights"] >> weights;
        // feature scaling ranges
        fs["feature_min"] >> feature_min;
        fs["feature_max"] >> feature_max;
        fs.release();
    }

    // dimensions implied by the loaded matrices
    nr_feature = weights.rows;
    nr_class   = weights.cols;
    patch_size = cvRound(sqrt((float)kernels.cols));

    // fixed geometry of the sliding window scheme
    window_size = 4*patch_size;
    step_size   = 4;
    quad_size   = 12;
    num_quads   = 25;
    num_tiles   = 25;
    alpha       = 0.5; // used in non-linear activation function z = max(0, |D*a| - alpha)
}
|
|
|
/**
 * Classifies every sliding window of the input word image.
 *
 * The image is converted to gray if needed and resized to a height of
 * window_size. A window_size x window_size window is then moved horizontally
 * in steps of step_size; for each position one vector of nr_class values and
 * one oversegmentation point (the running window index) are appended.
 *
 * @param _src                       CV_8UC3 or CV_8UC1 image.
 * @param recognition_probabilities  cleared, then filled with one vector per window.
 * @param oversegmentation           cleared, then filled with 0, 1, 2, ... per window.
 *
 * Raises Error::StsOutOfRange when eval_feature() returns a label outside [0, nr_class].
 */
void OCRBeamSearchClassifierCNN::eval( InputArray _src, vector< vector<double> >& recognition_probabilities, vector<int>& oversegmentation)
{
    CV_Assert(( _src.getMat().type() == CV_8UC3 ) || ( _src.getMat().type() == CV_8UC1 ));

    // clearing the outer vector also releases every inner vector
    recognition_probabilities.clear();
    oversegmentation.clear();

    Mat src = _src.getMat();
    if(src.type() == CV_8UC3)
    {
        cvtColor(src,src,COLOR_RGB2GRAY);
    }

    // normalize the height to the window size, keeping the aspect ratio
    resize(src,src,Size(window_size*src.cols/src.rows,window_size));

    int seg_points = 0;

    Mat quad;
    Mat tmp;
    Mat img;

    int sz = src.cols - window_size;
    int sz_window_quad = window_size - quad_size;
    int sz_half_quad = (int)(quad_size/2-1);
    int sz_quad_patch = quad_size - patch_size;

    // begin sliding window loop foreach detection window
    for (int x_c = 0; x_c <= sz; x_c += step_size)
    {
        img = src(Rect(Point(x_c,0),Size(window_size,window_size)));

        // patches are gathered into 9 pools; a quad may contribute to several pools
        vector< vector<double> > data_pool(9);

        // quads are numbered from 1 in the order they are visited (q_x outer, q_y inner)
        int quad_id = 1;

        for (int q_x = 0; q_x <= sz_window_quad; q_x += sz_half_quad)
        {
            for (int q_y = 0; q_y <= sz_window_quad; q_y += sz_half_quad)
            {
                Rect quad_rect = Rect(q_x,q_y,quad_size,quad_size);
                quad = img(quad_rect);

                //start sliding window (8x8) in each tile and store the patch as row in data_pool
                for (int w_x = 0; w_x <= sz_quad_patch; w_x++)
                {
                    for (int w_y = 0; w_y <= sz_quad_patch; w_y++)
                    {
                        quad(Rect(w_x,w_y,patch_size,patch_size)).convertTo(tmp, CV_64F);
                        tmp = tmp.reshape(0,1);
                        normalizeAndZCA(tmp);
                        vector<double> patch;
                        tmp.copyTo(patch);
                        if ((quad_id == 1)||(quad_id == 2)||(quad_id == 6)||(quad_id == 7))
                            data_pool[0].insert(data_pool[0].end(),patch.begin(),patch.end());
                        if ((quad_id == 2)||(quad_id == 7)||(quad_id == 3)||(quad_id == 8)||(quad_id == 4)||(quad_id == 9))
                            data_pool[1].insert(data_pool[1].end(),patch.begin(),patch.end());
                        if ((quad_id == 4)||(quad_id == 9)||(quad_id == 5)||(quad_id == 10))
                            data_pool[2].insert(data_pool[2].end(),patch.begin(),patch.end());
                        if ((quad_id == 6)||(quad_id == 11)||(quad_id == 16)||(quad_id == 7)||(quad_id == 12)||(quad_id == 17))
                            data_pool[3].insert(data_pool[3].end(),patch.begin(),patch.end());
                        if ((quad_id == 7)||(quad_id == 12)||(quad_id == 17)||(quad_id == 8)||(quad_id == 13)||(quad_id == 18)||(quad_id == 9)||(quad_id == 14)||(quad_id == 19))
                            data_pool[4].insert(data_pool[4].end(),patch.begin(),patch.end());
                        if ((quad_id == 9)||(quad_id == 14)||(quad_id == 19)||(quad_id == 10)||(quad_id == 15)||(quad_id == 20))
                            data_pool[5].insert(data_pool[5].end(),patch.begin(),patch.end());
                        if ((quad_id == 16)||(quad_id == 21)||(quad_id == 17)||(quad_id == 22))
                            data_pool[6].insert(data_pool[6].end(),patch.begin(),patch.end());
                        if ((quad_id == 17)||(quad_id == 22)||(quad_id == 18)||(quad_id == 23)||(quad_id == 19)||(quad_id == 24))
                            data_pool[7].insert(data_pool[7].end(),patch.begin(),patch.end());
                        if ((quad_id == 19)||(quad_id == 24)||(quad_id == 20)||(quad_id == 25))
                            data_pool[8].insert(data_pool[8].end(),patch.begin(),patch.end());
                    }
                }

                quad_id++;
            }
        }

        //do dot product of each normalized and whitened patch
        //the responses of each pool are accumulated and this yields a representation of 9xD
        Mat feature = Mat::zeros(9,kernels.rows,CV_64FC1);
        for (int i=0; i<9; i++)
        {
            Mat pool = Mat(data_pool[i]);
            pool = pool.reshape(0,(int)data_pool[i].size()/kernels.cols);
            for (int p=0; p<pool.rows; p++)
            {
                const Mat pool_row = pool.row(p);
                for (int f=0; f<kernels.rows; f++)
                {
                    feature.at<double>(i,f) = feature.at<double>(i,f) + max(0.0,std::abs(pool_row.dot(kernels.row(f)))-alpha);
                }
            }
        }
        feature = feature.reshape(0,1);

        // data must be normalized within the range obtained during training
        double lower = -1.0;
        double upper =  1.0;
        for (int k=0; k<feature.cols; k++)
        {
            feature.at<double>(0,k) = lower + (upper-lower) *
                    (feature.at<double>(0,k)-feature_min.at<double>(0,k))/
                    (feature_max.at<double>(0,k)-feature_min.at<double>(0,k));
        }

        // The per-window scores live in a vector, so nothing has to be freed by hand,
        // not even when the range check below throws.
        vector<double> recognition_p((size_t)nr_class);
        double predict_label = eval_feature(feature, recognition_p.empty() ? NULL : &recognition_p[0]);

        if ( (predict_label < 0) || (predict_label > nr_class) )
            CV_Error(Error::StsOutOfRange, "OCRBeamSearchClassifierCNN::eval Error: unexpected prediction in eval_feature()");

        recognition_probabilities.push_back(recognition_p);
        oversegmentation.push_back(seg_points);
        seg_points++;
    }
}
|
|
|
// normalize for contrast and apply ZCA whitening to a set of image patches |
|
void OCRBeamSearchClassifierCNN::normalizeAndZCA(Mat& patches) |
|
{ |
|
|
|
//Normalize for contrast |
|
for (int i=0; i<patches.rows; i++) |
|
{ |
|
Scalar row_mean, row_std; |
|
meanStdDev(patches.row(i),row_mean,row_std); |
|
row_std[0] = sqrt(pow(row_std[0],2)*patches.cols/(patches.cols-1)+10); |
|
patches.row(i) = (patches.row(i) - row_mean[0]) / row_std[0]; |
|
} |
|
|
|
|
|
//ZCA whitening |
|
if ((M.dims == 0) || (P.dims == 0)) |
|
{ |
|
Mat CC; |
|
calcCovarMatrix(patches,CC,M,COVAR_NORMAL|COVAR_ROWS|COVAR_SCALE); |
|
CC = CC * patches.rows / (patches.rows-1); |
|
|
|
|
|
Mat e_val,e_vec; |
|
eigen(CC.t(),e_val,e_vec); |
|
e_vec = e_vec.t(); |
|
sqrt(1./(e_val + 0.1), e_val); |
|
|
|
|
|
Mat V = Mat::zeros(e_vec.rows, e_vec.cols, CV_64FC1); |
|
Mat D = Mat::eye(e_vec.rows, e_vec.cols, CV_64FC1); |
|
|
|
for (int i=0; i<e_vec.cols; i++) |
|
{ |
|
e_vec.col(e_vec.cols-i-1).copyTo(V.col(i)); |
|
D.col(i) = D.col(i) * e_val.at<double>(0,e_val.rows-i-1); |
|
} |
|
|
|
P = V * D * V.t(); |
|
} |
|
|
|
for (int i=0; i<patches.rows; i++) |
|
patches.row(i) = patches.row(i) - M; |
|
|
|
patches = patches * P; |
|
|
|
} |
|
|
|
/**
 * Evaluates the linear model on one feature vector.
 *
 * @param feature         single row of doubles with at least nr_feature columns.
 * @param prob_estimates  output array of nr_class values; on return it holds the
 *                        logistic of each class score, rescaled so the values sum to 1.
 * @return index of the class with the highest raw score (first one on ties).
 */
double OCRBeamSearchClassifierCNN::eval_feature(Mat& feature, double* prob_estimates)
{
    for (int c = 0; c < nr_class; ++c)
        prob_estimates[c] = 0;

    // raw class scores: weighted sum of the features  //TODO use vectorized dot product
    for (int k = 0; k < nr_feature; ++k)
    {
        const double value = feature.at<double>(0,k);
        for (int c = 0; c < nr_class; ++c)
            prob_estimates[c] += weights.at<float>(k,c)*value;
    }

    // the winner is chosen on the raw scores, before they are squashed
    int winner = 0;
    for (int c = 1; c < nr_class; ++c)
    {
        if (prob_estimates[c] > prob_estimates[winner])
            winner = c;
    }

    // logistic function on every score, accumulating the total on the way
    double total = 0;
    for (int c = 0; c < nr_class; ++c)
    {
        prob_estimates[c] = 1/(1+exp(-prob_estimates[c]));
        total += prob_estimates[c];
    }

    // rescale so that the estimates add up to one
    for (int c = 0; c < nr_class; ++c)
        prob_estimates[c] = prob_estimates[c]/total;

    return winner;
}
|
|
|
/**
 * Creates an OCRBeamSearchClassifierCNN from the given parameter file and
 * returns it through the ClassifierCallback interface.
 */
Ptr<OCRBeamSearchDecoder::ClassifierCallback> loadOCRBeamSearchClassifierCNN(const String& filename)
{
    Ptr<OCRBeamSearchDecoder::ClassifierCallback> classifier =
        makePtr<OCRBeamSearchClassifierCNN>(std::string(filename));
    return classifier;
}
|
|
|
} |
|
}
|
|
|