Adds a first implementation of the OCRBeamSearchDecoder class, using the single-layer CNN character classifier described in: Coates, Adam, et al., "Text Detection and Character Recognition in Scene Images with Unsupervised Feature Learning", ICDAR 2011.
parent
c05a7e0182
commit
52cca0ddc6
2 changed files with 768 additions and 0 deletions
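For context, a minimal usage sketch (not part of this commit): the classifier data file name, vocabulary string, transition matrix, and beam size below are illustrative placeholders, not values fixed by this patch; a real language model should supply the transition probabilities.

// Hypothetical example: recognize a cropped word image with the new decoder.
#include "opencv2/text.hpp"
#include "opencv2/imgcodecs.hpp"
#include <iostream>

int main()
{
    cv::Mat image = cv::imread("scene_word.png"); // placeholder input image

    // 62 character classes; the order must match the one used to train the classifier
    std::string vocabulary = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";

    // Placeholder language model: uniform bigram transitions, identity emissions
    cv::Mat transition_p((int)vocabulary.size(), (int)vocabulary.size(), CV_64FC1,
                         cv::Scalar(1.0/vocabulary.size()));
    cv::Mat emission_p = cv::Mat::eye((int)vocabulary.size(), (int)vocabulary.size(), CV_64FC1);

    cv::Ptr<cv::text::OCRBeamSearchDecoder> ocr = cv::text::OCRBeamSearchDecoder::create(
            cv::text::loadOCRBeamSearchClassifierCNN("OCRBeamSearch_CNN_model_data.xml.gz"),
            vocabulary, transition_p, emission_p, cv::text::OCR_DECODER_VITERBI, 50);

    std::string output;
    ocr->run(image, output, NULL, NULL, NULL, cv::text::OCR_LEVEL_WORD);
    std::cout << output << std::endl;
    return 0;
}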
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp" |
||||||
|
#include "opencv2/imgproc.hpp" |
||||||
|
#include "opencv2/ml.hpp" |
||||||
|
|
||||||
|
#include <iostream> |
||||||
|
#include <fstream> |
||||||
|
#include <set> |
||||||
|
|
||||||
|
namespace cv |
||||||
|
{ |
||||||
|
namespace text |
||||||
|
{ |
||||||
|
|
||||||
|
using namespace std; |
||||||
|
using namespace cv::ml; |
||||||
|
|
||||||
|
/* OCR BeamSearch Decoder */ |
||||||
|
|
||||||
|
void OCRBeamSearchDecoder::run(Mat& image, string& output_text, vector<Rect>* component_rects,
                               vector<string>* component_texts, vector<float>* component_confidences,
                               int component_level)
{
    CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
    CV_Assert( (component_level == OCR_LEVEL_TEXTLINE) || (component_level == OCR_LEVEL_WORD) );
    output_text.clear();
    if (component_rects != NULL)
        component_rects->clear();
    if (component_texts != NULL)
        component_texts->clear();
    if (component_confidences != NULL)
        component_confidences->clear();
}


void OCRBeamSearchDecoder::ClassifierCallback::eval( InputArray image, vector< vector<double> >& recognition_probabilities, vector<int>& oversegmentation)
{
    CV_Assert(( image.getMat().type() == CV_8UC3 ) || ( image.getMat().type() == CV_8UC1 ));
    if (!recognition_probabilities.empty())
    {
        for (size_t i=0; i<recognition_probabilities.size(); i++)
            recognition_probabilities[i].clear();
    }
    recognition_probabilities.clear();
    oversegmentation.clear();
}


// orders beam states by descending score
bool beam_sort_function ( pair< double,vector<int> > i, pair< double,vector<int> > j );
bool beam_sort_function ( pair< double,vector<int> > i, pair< double,vector<int> > j )
{
    return (i.first > j.first);
}


class OCRBeamSearchDecoderImpl : public OCRBeamSearchDecoder
{
public:
    //Default constructor
    OCRBeamSearchDecoderImpl( Ptr<OCRBeamSearchDecoder::ClassifierCallback> _classifier,
                              const string& _vocabulary,
                              InputArray transition_probabilities_table,
                              InputArray emission_probabilities_table,
                              decoder_mode _mode,
                              int _beam_size)
    {
        classifier = _classifier;
        transition_p = transition_probabilities_table.getMat();
        emission_p = emission_probabilities_table.getMat();
        vocabulary = _vocabulary;
        mode = _mode;
        beam_size = _beam_size;
    }

    ~OCRBeamSearchDecoderImpl()
    {
    }

    void run( Mat& src,
              string& out_sequence,
              vector<Rect>* component_rects,
              vector<string>* component_texts,
              vector<float>* component_confidences,
              int component_level)
    {
        CV_Assert( (src.type() == CV_8UC1) || (src.type() == CV_8UC3) );
        CV_Assert( (src.cols > 0) && (src.rows > 0) );
        CV_Assert( component_level == OCR_LEVEL_WORD );
        out_sequence.clear();
        if (component_rects != NULL)
            component_rects->clear();
        if (component_texts != NULL)
            component_texts->clear();
        if (component_confidences != NULL)
            component_confidences->clear();

        // TODO split a line into words

        if(src.type() == CV_8UC3)
        {
            cvtColor(src,src,COLOR_RGB2GRAY);
        }

        vector< vector<double> > recognition_probabilities;
        vector<int> oversegmentation;

        classifier->eval(src, recognition_probabilities, oversegmentation);

        /* Run the beam search algorithm to optimize the recognition score */

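        // Outline of the search: the initial state is the coarsest segmentation
        // (only the leftmost and rightmost cut points). At every iteration each
        // state in the beam is expanded by adding one unused cut point
        // (generate_childs), every unvisited child is scored with the Viterbi
        // pass in score_segmentation, and only the beam_size best states are
        // kept (update_beam). The search stops when no state in the beam
        // produces a new child.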
        // convert probabilities to log probabilities
        for (size_t i=0; i<recognition_probabilities.size(); i++)
        {
            for (size_t j=0; j<recognition_probabilities[i].size(); j++)
            {
                if (recognition_probabilities[i][j] == 0)
                    recognition_probabilities[i][j] = -DBL_MAX;
                else
                    recognition_probabilities[i][j] = log(recognition_probabilities[i][j]);
            }
        }
        // NOTE the conversion is done in place, so calling run() twice on the
        // same decoder would apply the log to transition_p a second time
        for (int i=0; i<transition_p.rows; i++)
        {
            for (int j=0; j<transition_p.cols; j++)
            {
                if (transition_p.at<double>(i,j) == 0)
                    transition_p.at<double>(i,j) = -DBL_MAX;
                else
                    transition_p.at<double>(i,j) = log(transition_p.at<double>(i,j));
            }
        }

        // TODO it would be interesting to have a hash table with a vector of booleans,
        //      but this is not feasible when the number of possible segmentations is large:
        //vector<bool> visited_nodes(pow(2,oversegmentation.size()),false); // hash table for visited nodes
        //      an alternative is to use std::set<unsigned long long int> to store only the
        //      keys of visited nodes, at some cost in time performance.
        set<unsigned long long int> visited_nodes; //TODO make it a member of the class
        // It is also possible to reduce the number of segmentation points in some way
        // (e.g. use only the points for which the class prediction changes).

        vector<int> start_segmentation;
        start_segmentation.push_back(oversegmentation[0]);
        start_segmentation.push_back(oversegmentation[oversegmentation.size()-1]);

        vector< pair< double,vector<int> > > beam;
        beam.push_back( pair< double,vector<int> > (score_segmentation(start_segmentation, recognition_probabilities, out_sequence), start_segmentation) );

        vector< vector<int> > childs = generate_childs(start_segmentation, oversegmentation, visited_nodes);
        if (!childs.empty())
            update_beam( beam, childs, recognition_probabilities);
        //cout << "beam size " << beam.size() << " best score " << beam[0].first << endl;

        int generated_childs = (int)childs.size();
        while (generated_childs != 0)
        {
            generated_childs = 0;
            vector< pair< double,vector<int> > > old_beam = beam;

            for (size_t i=0; i<old_beam.size(); i++)
            {
                childs = generate_childs(old_beam[i].second, oversegmentation, visited_nodes);
                if (!childs.empty())
                    update_beam( beam, childs, recognition_probabilities);
                generated_childs += (int)childs.size();
            }
            //cout << "beam size " << beam.size() << " best score " << beam[0].first << endl;
        }

        // FINISHED! Get the best prediction found into out_sequence
        score_segmentation(beam[0].second, recognition_probabilities, out_sequence);

        // TODO fill the other output parameters

        return;
    }

private:

    ////////////////////////////////////////////////////////////

    // TODO the way we expand nodes makes the recognition score heuristic non-monotonic:
    //      the search should start from the leftmost cut point and always grow to the right.

    vector< vector<int> > generate_childs(vector<int> &segmentation, vector<int> &oversegmentation, set<unsigned long long int> &visited_nodes)
    {
        /*cout << " generate childs for [";
        for (size_t i = 0 ; i < segmentation.size(); i++)
            cout << segmentation[i] << ",";
        cout << "] ";*/

        vector< vector<int> > childs;
        for (size_t i=0; i<oversegmentation.size(); i++)
        {
            int seg_point = oversegmentation[i];
            if (find(segmentation.begin(), segmentation.end(), seg_point) == segmentation.end())
            {
                //cout << seg_point << " ";
                vector<int> child = segmentation;
                child.push_back(seg_point);
                sort(child.begin(), child.end());
                // encode the child as a bitmask over the oversegmentation points:
                // bit k is set iff oversegmentation[k] belongs to the child
                unsigned long long int key = 0;
                for (size_t j=0; j<child.size(); j++)
                {
                    size_t idx = find(oversegmentation.begin(), oversegmentation.end(), child[j]) - oversegmentation.begin();
                    key += 1ULL << idx;
                }
                //if (!visited_nodes[key])
                if (visited_nodes.find(key) == visited_nodes.end())
                {
                    childs.push_back(child);
                    //visited_nodes[key] = true;
                    visited_nodes.insert(key);
                }
            }
        }
        //cout << endl;
        return childs;
    }


    ////////////////////////////////////////////////////////////

    //TODO should the beam itself be a member of the class?
    void update_beam (vector< pair< double,vector<int> > > &beam, vector< vector<int> > &childs, vector< vector<double> > &recognition_probabilities)
    {
        string out_sequence;
        double min_score = -DBL_MAX; // minimum score required to enter the beam
        if ((int)beam.size() == beam_size)
            min_score = beam[beam.size()-1].first; // the last element has the lowest score
        for (size_t i=0; i<childs.size(); i++)
        {
            double score = score_segmentation(childs[i], recognition_probabilities, out_sequence);
            if (score > min_score)
            {
                beam.push_back(pair< double,vector<int> >(score,childs[i]));
                sort(beam.begin(),beam.end(),beam_sort_function);
                if ((int)beam.size() > beam_size)
                {
                    beam.pop_back();
                    min_score = beam[beam.size()-1].first;
                }
            }
        }
    }


    ////////////////////////////////////////////////////////////

    // TODO Add heuristics to the score function (see the PhotoOCR paper):
    //      e.g. discard a segmentation because it includes a very large character,
    //      or because the overlap between two characters is too large, etc.
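    // score_segmentation runs a standard Viterbi pass over the character windows
    // defined by the segmentation: with log start probabilities start_p, log
    // transition matrix A = transition_p and per-window log class scores
    // B = observations, it computes
    //     V(0,i) = start_p(i) + B(0,i)
    //     V(t,i) = max_j ( V(t-1,j) + A(j,i) ) + B(t,i)
    // and returns the best final score normalized by the number of windows,
    // filling outstring with the corresponding character sequence.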
    double score_segmentation(vector<int> &segmentation, vector< vector<double> > &observations, string& outstring)
    {
        //TODO this must be extracted from the dictionary
        vector<double> start_p(vocabulary.size());
        for (int i=0; i<(int)vocabulary.size(); i++)
            start_p[i] = log(1.0/vocabulary.size());

        Mat V = Mat::ones((int)segmentation.size()-1,(int)vocabulary.size(),CV_64FC1);
        V = V * -DBL_MAX;
        vector<string> path(vocabulary.size());

        // Initialize base cases (t == 0)
        for (int i=0; i<(int)vocabulary.size(); i++)
        {
            V.at<double>(0,i) = start_p[i] + observations[segmentation[1]-1][i];
            path[i] = vocabulary.at(i);
        }

        // Run Viterbi for t > 0
        for (int t=1; t<(int)segmentation.size()-1; t++)
        {
            vector<string> newpath(vocabulary.size());

            for (int i=0; i<(int)vocabulary.size(); i++)
            {
                double max_prob = -DBL_MAX;
                int best_idx = 0;
                for (int j=0; j<(int)vocabulary.size(); j++)
                {
                    double prob = V.at<double>(t-1,j) + transition_p.at<double>(j,i) + observations[segmentation[t+1]-1][i];
                    if ( prob > max_prob)
                    {
                        max_prob = prob;
                        best_idx = j;
                    }
                }

                V.at<double>(t,i) = max_prob;
                newpath[i] = path[best_idx] + vocabulary.at(i);
            }

            // We don't need to remember the old paths
            path.swap(newpath);
        }

        double max_prob = -DBL_MAX;
        int best_idx = 0;
        for (int i=0; i<(int)vocabulary.size(); i++)
        {
            double prob = V.at<double>((int)segmentation.size()-2,i);
            if ( prob > max_prob)
            {
                max_prob = prob;
                best_idx = i;
            }
        }

        //cout << " score " << max_prob / (segmentation.size()-1) << " " << path[best_idx] << endl;
        outstring = path[best_idx];
        return max_prob / (segmentation.size()-1);
    }

};

Ptr<OCRBeamSearchDecoder> OCRBeamSearchDecoder::create( Ptr<OCRBeamSearchDecoder::ClassifierCallback> _classifier,
                                                        const string& _vocabulary,
                                                        InputArray transition_p,
                                                        InputArray emission_p,
                                                        decoder_mode _mode,
                                                        int _beam_size)
{
    return makePtr<OCRBeamSearchDecoderImpl>(_classifier, _vocabulary, transition_p, emission_p, _mode, _beam_size);
}


class CV_EXPORTS OCRBeamSearchClassifierCNN : public OCRBeamSearchDecoder::ClassifierCallback
{
public:
    //constructor
    OCRBeamSearchClassifierCNN(const std::string& filename);
    // Destructor
    ~OCRBeamSearchClassifierCNN() {}

    void eval( InputArray src, vector< vector<double> >& recognition_probabilities, vector<int>& oversegmentation );

protected:
    void normalizeAndZCA(Mat& patches);
    double eval_feature(Mat& feature, double* prob_estimates);

private:
    //TODO implement getters/setters for some of these members (where applicable)
    int nr_class;    // number of classes
    int nr_feature;  // number of features
    Mat feature_min; // scale range
    Mat feature_max;
    Mat weights;     // Logistic Regression weights
    Mat kernels;     // CNN kernels
    Mat M, P;        // ZCA Whitening parameters
    int step_size;   // sliding window step
    int window_size; // window size
    int quad_size;
    int patch_size;
    int num_quads;   // extract 25 quads (12x12) from each image
    int num_tiles;   // extract 25 patches (8x8) from each quad
    double alpha;    // used in the non-linear activation function z = max(0, |D*a| - alpha)
};

OCRBeamSearchClassifierCNN::OCRBeamSearchClassifierCNN (const string& filename)
{
    if (ifstream(filename.c_str()))
    {
        FileStorage fs(filename, FileStorage::READ);
        // Load the kernels bank and whitening params
        fs["kernels"] >> kernels;
        fs["M"] >> M;
        fs["P"] >> P;
        // Load the Logistic Regression weights
        fs["weights"] >> weights;
        // Load the feature scaling ranges
        fs["feature_min"] >> feature_min;
        fs["feature_max"] >> feature_max;
        fs.release();
        // TODO check that all matrix dimensions match and none is empty
    }
    else
        CV_Error(Error::StsBadArg, "Default classifier data file not found!");

    nr_feature = weights.rows;
    nr_class   = weights.cols;
    // TODO some of this can be inferred from the input file (e.g. patch size must be sqrt(kernels.cols))
    step_size   = 4;
    window_size = 32;
    quad_size   = 12;
    patch_size  = 8;
    num_quads   = 25;
    num_tiles   = 25;
    alpha       = 0.5;
}

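// Feature extraction in eval() follows the single-layer pipeline of Coates et al.:
// a 32x32 window slides over the word image; inside each window, 8x8 patches are
// densely sampled from 25 overlapping 12x12 quads, contrast-normalized and
// ZCA-whitened, encoded against the learned kernel bank with the soft activation
// max(0, |D*a| - alpha), and accumulated over a 3x3 grid of 9 overlapping pooling
// regions before being fed to the logistic regression classifier.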
void OCRBeamSearchClassifierCNN::eval( InputArray _src, vector< vector<double> >& recognition_probabilities, vector<int>& oversegmentation)
{
    CV_Assert(( _src.getMat().type() == CV_8UC3 ) || ( _src.getMat().type() == CV_8UC1 ));
    if (!recognition_probabilities.empty())
    {
        for (size_t i=0; i<recognition_probabilities.size(); i++)
            recognition_probabilities[i].clear();
    }
    recognition_probabilities.clear();
    oversegmentation.clear();

    Mat src = _src.getMat();
    if(src.type() == CV_8UC3)
    {
        cvtColor(src,src,COLOR_RGB2GRAY);
    }

    // TODO shall we resize the input image or make a copy?
    resize(src,src,Size(window_size*src.cols/src.rows,window_size));

    int seg_points = 0;
    oversegmentation.push_back(seg_points);

    Mat quad;
    Mat tmp;
    Mat img;

    // sliding window loop: one iteration per detection window
    for (int x_c=0; x_c<=src.cols-window_size; x_c=x_c+step_size)
    {
        img = src(Rect(Point(x_c,0),Size(window_size,window_size)));

        int patch_count = 0;
        vector< vector<double> > data_pool(9);

        int quad_id = 1;
        for (int q_x=0; q_x<=window_size-quad_size; q_x=q_x+(quad_size/2-1))
        {
            for (int q_y=0; q_y<=window_size-quad_size; q_y=q_y+(quad_size/2-1))
            {
                Rect quad_rect = Rect(q_x,q_y,quad_size,quad_size);
                quad = img(quad_rect);

                // slide an 8x8 window inside each quad and assign every normalized
                // patch to the pooling regions its quad contributes to (quads form
                // a 5x5 grid; pools form an overlapping 3x3 grid over that)
                for (int w_x=0; w_x<=quad_size-patch_size; w_x++)
                {
                    for (int w_y=0; w_y<=quad_size-patch_size; w_y++)
                    {
                        quad(Rect(w_x,w_y,patch_size,patch_size)).copyTo(tmp);
                        tmp = tmp.reshape(0,1);
                        tmp.convertTo(tmp, CV_64F);
                        normalizeAndZCA(tmp);
                        vector<double> patch;
                        tmp.copyTo(patch);
                        if ((quad_id == 1)||(quad_id == 2)||(quad_id == 6)||(quad_id == 7))
                            data_pool[0].insert(data_pool[0].end(),patch.begin(),patch.end());
                        if ((quad_id == 2)||(quad_id == 7)||(quad_id == 3)||(quad_id == 8)||(quad_id == 4)||(quad_id == 9))
                            data_pool[1].insert(data_pool[1].end(),patch.begin(),patch.end());
                        if ((quad_id == 4)||(quad_id == 9)||(quad_id == 5)||(quad_id == 10))
                            data_pool[2].insert(data_pool[2].end(),patch.begin(),patch.end());
                        if ((quad_id == 6)||(quad_id == 11)||(quad_id == 16)||(quad_id == 7)||(quad_id == 12)||(quad_id == 17))
                            data_pool[3].insert(data_pool[3].end(),patch.begin(),patch.end());
                        if ((quad_id == 7)||(quad_id == 12)||(quad_id == 17)||(quad_id == 8)||(quad_id == 13)||(quad_id == 18)||(quad_id == 9)||(quad_id == 14)||(quad_id == 19))
                            data_pool[4].insert(data_pool[4].end(),patch.begin(),patch.end());
                        if ((quad_id == 9)||(quad_id == 14)||(quad_id == 19)||(quad_id == 10)||(quad_id == 15)||(quad_id == 20))
                            data_pool[5].insert(data_pool[5].end(),patch.begin(),patch.end());
                        if ((quad_id == 16)||(quad_id == 21)||(quad_id == 17)||(quad_id == 22))
                            data_pool[6].insert(data_pool[6].end(),patch.begin(),patch.end());
                        if ((quad_id == 17)||(quad_id == 22)||(quad_id == 18)||(quad_id == 23)||(quad_id == 19)||(quad_id == 24))
                            data_pool[7].insert(data_pool[7].end(),patch.begin(),patch.end());
                        if ((quad_id == 19)||(quad_id == 24)||(quad_id == 20)||(quad_id == 25))
                            data_pool[8].insert(data_pool[8].end(),patch.begin(),patch.end());
                        patch_count++;
                    }
                }

                quad_id++;
            }
        }

        // dot product of each normalized and whitened patch with the kernel bank;
        // each pool is accumulated, yielding a 9xD representation per window
        Mat feature = Mat::zeros(9,kernels.rows,CV_64FC1);
        for (int i=0; i<9; i++)
        {
            Mat pool = Mat(data_pool[i]);
            pool = pool.reshape(0,(int)data_pool[i].size()/kernels.cols);
            for (int p=0; p<pool.rows; p++)
            {
                for (int f=0; f<kernels.rows; f++)
                {
                    feature.row(i).at<double>(0,f) = feature.row(i).at<double>(0,f) + max(0.0,std::abs(pool.row(p).dot(kernels.row(f)))-alpha);
                }
            }
        }
        feature = feature.reshape(0,1);

        // data must be normalized within the range obtained during training
        double lower = -1.0;
        double upper =  1.0;
        for (int k=0; k<feature.cols; k++)
        {
            feature.at<double>(0,k) = lower + (upper-lower) *
                    (feature.at<double>(0,k)-feature_min.at<double>(0,k))/
                    (feature_max.at<double>(0,k)-feature_min.at<double>(0,k));
        }

        // variable-length arrays are not standard C++, so use a vector
        vector<double> probabilities(nr_class);
        double predict_label = eval_feature(feature,&probabilities[0]);
        //cout << " Prediction: " << vocabulary[predict_label] << " with probability " << probabilities[0] << endl;
        if (predict_label < 0) // TODO use CV_Error
            cout << "OCRBeamSearchClassifierCNN::eval Error: unexpected prediction in eval_feature()" << endl;

        seg_points++;
        oversegmentation.push_back(seg_points);
        recognition_probabilities.push_back(probabilities);

    }

}


// normalize for contrast and apply ZCA whitening to a set of image patches
void OCRBeamSearchClassifierCNN::normalizeAndZCA(Mat& patches)
{
    //Normalize for contrast
    for (int i=0; i<patches.rows; i++)
    {
        Scalar row_mean, row_std;
        meanStdDev(patches.row(i),row_mean,row_std);
        // unbiased std estimate, regularized by +10 to avoid amplifying noise
        row_std[0] = sqrt(pow(row_std[0],2)*patches.cols/(patches.cols-1)+10);
        patches.row(i) = (patches.row(i) - row_mean[0]) / row_std[0];
    }

    //ZCA whitening
    if ((M.dims == 0) || (P.dims == 0))
    {
        Mat CC;
        calcCovarMatrix(patches,CC,M,COVAR_NORMAL|COVAR_ROWS|COVAR_SCALE);
        CC = CC * patches.rows / (patches.rows-1);

        Mat e_val,e_vec;
        eigen(CC.t(),e_val,e_vec);
        e_vec = e_vec.t();
        sqrt(1./(e_val + 0.1), e_val);

        // build the whitening matrix P = V * D * V' with D = diag(1/sqrt(eigenvalue + 0.1)),
        // reordering the eigenvectors/eigenvalues from smallest to largest
        Mat V = Mat::zeros(e_vec.rows, e_vec.cols, CV_64FC1);
        Mat D = Mat::eye(e_vec.rows, e_vec.cols, CV_64FC1);

        for (int i=0; i<e_vec.cols; i++)
        {
            e_vec.col(e_vec.cols-i-1).copyTo(V.col(i));
            D.col(i) = D.col(i) * e_val.at<double>(0,e_val.rows-i-1);
        }

        P = V * D * V.t();
    }

    for (int i=0; i<patches.rows; i++)
        patches.row(i) = patches.row(i) - M;

    patches = patches * P;

}

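// eval_feature computes one linear score per class (w_i . x), squashes each score
// through a sigmoid, and renormalizes so the values sum to one; prob_estimates
// receives this pseudo-probability vector and the returned value is the index of
// the highest-scoring class.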
double OCRBeamSearchClassifierCNN::eval_feature(Mat& feature, double* prob_estimates)
{
    for(int i=0;i<nr_class;i++)
        prob_estimates[i] = 0;

    for(int idx=0; idx<nr_feature; idx++)
        for(int i=0;i<nr_class;i++)
            prob_estimates[i] += weights.at<float>(idx,i)*feature.at<double>(0,idx); //TODO use a vectorized dot product

    int dec_max_idx = 0;
    for(int i=1;i<nr_class;i++)
    {
        if(prob_estimates[i] > prob_estimates[dec_max_idx])
            dec_max_idx = i;
    }

    for(int i=0;i<nr_class;i++)
        prob_estimates[i]=1/(1+exp(-prob_estimates[i]));

    double sum=0;
    for(int i=0; i<nr_class; i++)
        sum+=prob_estimates[i];

    for(int i=0; i<nr_class; i++)
        prob_estimates[i]=prob_estimates[i]/sum;

    return dec_max_idx;
}


Ptr<OCRBeamSearchDecoder::ClassifierCallback> loadOCRBeamSearchClassifierCNN(const std::string& filename)
{
    return makePtr<OCRBeamSearchClassifierCNN>(filename);
}

} // namespace text
} // namespace cv