@ -74,6 +74,22 @@ void OCRHMMDecoder::run(Mat& image, string& output_text, vector<Rect>* component
component_confidences - > clear ( ) ;
}
void OCRHMMDecoder : : run ( Mat & image , Mat & mask , string & output_text , vector < Rect > * component_rects ,
vector < string > * component_texts , vector < float > * component_confidences ,
int component_level )
{
CV_Assert ( ( image . type ( ) = = CV_8UC1 ) | | ( image . type ( ) = = CV_8UC3 ) ) ;
CV_Assert ( mask . type ( ) = = CV_8UC1 ) ;
CV_Assert ( ( component_level = = OCR_LEVEL_TEXTLINE ) | | ( component_level = = OCR_LEVEL_WORD ) ) ;
output_text . clear ( ) ;
if ( component_rects ! = NULL )
component_rects - > clear ( ) ;
if ( component_texts ! = NULL )
component_texts - > clear ( ) ;
if ( component_confidences ! = NULL )
component_confidences - > clear ( ) ;
}
void OCRHMMDecoder : : ClassifierCallback : : eval ( InputArray image , vector < int > & out_class , vector < double > & out_confidence )
{
CV_Assert ( ( image . getMat ( ) . type ( ) = = CV_8UC3 ) | | ( image . getMat ( ) . type ( ) = = CV_8UC1 ) ) ;
@ -263,6 +279,263 @@ public:
obs . push_back ( out_class [ 0 ] ) ;
observations . push_back ( out_class ) ;
confidences . push_back ( out_conf ) ;
//cout << " out class = " << vocabulary[out_class[0]] << endl;
}
//This must be extracted from dictionary, or just assumed to be equal for all characters
vector < double > start_p ( vocabulary . size ( ) ) ;
for ( int i = 0 ; i < ( int ) vocabulary . size ( ) ; i + + )
start_p [ i ] = 1.0 / vocabulary . size ( ) ;
Mat V = Mat : : zeros ( ( int ) observations . size ( ) , ( int ) vocabulary . size ( ) , CV_64FC1 ) ;
vector < string > path ( vocabulary . size ( ) ) ;
// Initialize base cases (t == 0)
for ( int i = 0 ; i < ( int ) vocabulary . size ( ) ; i + + )
{
for ( int j = 0 ; j < ( int ) observations [ 0 ] . size ( ) ; j + + )
{
emission_p . at < double > ( observations [ 0 ] [ j ] , obs [ 0 ] ) = confidences [ 0 ] [ j ] ;
}
V . at < double > ( 0 , i ) = start_p [ i ] * emission_p . at < double > ( i , obs [ 0 ] ) ;
path [ i ] = vocabulary . at ( i ) ;
}
// Run Viterbi for t > 0
for ( int t = 1 ; t < ( int ) obs . size ( ) ; t + + )
{
//Dude this has to be done each time!!
emission_p = Mat : : eye ( 62 , 62 , CV_64FC1 ) ;
for ( int e = 0 ; e < ( int ) observations [ t ] . size ( ) ; e + + )
{
emission_p . at < double > ( observations [ t ] [ e ] , obs [ t ] ) = confidences [ t ] [ e ] ;
}
vector < string > newpath ( vocabulary . size ( ) ) ;
for ( int i = 0 ; i < ( int ) vocabulary . size ( ) ; i + + )
{
double max_prob = 0 ;
int best_idx = 0 ;
for ( int j = 0 ; j < ( int ) vocabulary . size ( ) ; j + + )
{
double prob = V . at < double > ( t - 1 , j ) * transition_p . at < double > ( j , i ) * emission_p . at < double > ( i , obs [ t ] ) ;
if ( prob > max_prob )
{
max_prob = prob ;
best_idx = j ;
}
}
V . at < double > ( t , i ) = max_prob ;
newpath [ i ] = path [ best_idx ] + vocabulary . at ( i ) ;
}
// Don't need to remember the old paths
path . swap ( newpath ) ;
}
double max_prob = 0 ;
int best_idx = 0 ;
for ( int i = 0 ; i < ( int ) vocabulary . size ( ) ; i + + )
{
double prob = V . at < double > ( ( int ) obs . size ( ) - 1 , i ) ;
if ( prob > max_prob )
{
max_prob = prob ;
best_idx = i ;
}
}
//cout << path[best_idx] << endl;
out_sequence = out_sequence + " " + path [ best_idx ] ;
if ( component_rects ! = NULL )
component_rects - > push_back ( words_rect [ w ] ) ;
if ( component_texts ! = NULL )
component_texts - > push_back ( path [ best_idx ] ) ;
if ( component_confidences ! = NULL )
component_confidences - > push_back ( ( float ) max_prob ) ;
}
return ;
}
void run ( Mat & image ,
Mat & mask ,
string & out_sequence ,
vector < Rect > * component_rects ,
vector < string > * component_texts ,
vector < float > * component_confidences ,
int component_level )
{
CV_Assert ( ( image . type ( ) = = CV_8UC1 ) | | ( image . type ( ) = = CV_8UC3 ) ) ;
CV_Assert ( mask . type ( ) = = CV_8UC1 ) ;
CV_Assert ( ( image . cols > 0 ) & & ( image . rows > 0 ) ) ;
CV_Assert ( ( image . cols = = mask . cols ) & & ( image . rows = = mask . rows ) ) ;
CV_Assert ( component_level = = OCR_LEVEL_WORD ) ;
out_sequence . clear ( ) ;
if ( component_rects ! = NULL )
component_rects - > clear ( ) ;
if ( component_texts ! = NULL )
component_texts - > clear ( ) ;
if ( component_confidences ! = NULL )
component_confidences - > clear ( ) ;
// First we split a line into words
vector < Mat > words_mask ;
vector < Rect > words_rect ;
/// Find contours
vector < vector < Point > > contours ;
vector < Vec4i > hierarchy ;
Mat tmp ;
mask . copyTo ( tmp ) ;
findContours ( tmp , contours , hierarchy , RETR_EXTERNAL , CHAIN_APPROX_SIMPLE , Point ( 0 , 0 ) ) ;
if ( contours . size ( ) < 6 )
{
//do not split lines with less than 6 characters
words_mask . push_back ( mask ) ;
words_rect . push_back ( Rect ( 0 , 0 , mask . cols , mask . rows ) ) ;
}
else
{
Mat_ < float > vector_w ( ( int ) mask . cols , 1 ) ;
reduce ( mask , vector_w , 0 , REDUCE_SUM , - 1 ) ;
vector < int > spaces ;
vector < int > spaces_start ;
vector < int > spaces_end ;
int space_count = 0 ;
int last_one_idx ;
int s_init = 0 , s_end = vector_w . cols ;
for ( int s = 0 ; s < vector_w . cols ; s + + )
{
if ( vector_w . at < float > ( 0 , s ) = = 0 )
s_init = s + 1 ;
else
break ;
}
for ( int s = vector_w . cols - 1 ; s > = 0 ; s - - )
{
if ( vector_w . at < float > ( 0 , s ) = = 0 )
s_end = s ;
else
break ;
}
for ( int s = s_init ; s < s_end ; s + + )
{
if ( vector_w . at < float > ( 0 , s ) = = 0 )
{
space_count + + ;
} else {
if ( space_count ! = 0 )
{
spaces . push_back ( space_count ) ;
spaces_start . push_back ( last_one_idx ) ;
spaces_end . push_back ( s - 1 ) ;
}
space_count = 0 ;
last_one_idx = s ;
}
}
Scalar mean_space , std_space ;
meanStdDev ( Mat ( spaces ) , mean_space , std_space ) ;
int num_word_spaces = 0 ;
int last_word_space_end = 0 ;
for ( int s = 0 ; s < ( int ) spaces . size ( ) ; s + + )
{
if ( spaces_end . at ( s ) - spaces_start . at ( s ) > mean_space [ 0 ] + ( mean_space [ 0 ] * 1.1 ) ) //this 1.1 is a param?
{
if ( num_word_spaces = = 0 )
{
//cout << " we have a word from 0 to " << spaces_start.at(s) << endl;
Mat word_mask ;
Rect word_rect = Rect ( 0 , 0 , spaces_start . at ( s ) , mask . rows ) ;
mask ( word_rect ) . copyTo ( word_mask ) ;
words_mask . push_back ( word_mask ) ;
words_rect . push_back ( word_rect ) ;
}
else
{
//cout << " we have a word from " << last_word_space_end << " to " << spaces_start.at(s) << endl;
Mat word_mask ;
Rect word_rect = Rect ( last_word_space_end , 0 , spaces_start . at ( s ) - last_word_space_end , mask . rows ) ;
mask ( word_rect ) . copyTo ( word_mask ) ;
words_mask . push_back ( word_mask ) ;
words_rect . push_back ( word_rect ) ;
}
num_word_spaces + + ;
last_word_space_end = spaces_end . at ( s ) ;
}
}
//cout << " we have a word from " << last_word_space_end << " to " << vector_w.cols << endl << endl << endl;
Mat word_mask ;
Rect word_rect = Rect ( last_word_space_end , 0 , vector_w . cols - last_word_space_end , mask . rows ) ;
mask ( word_rect ) . copyTo ( word_mask ) ;
words_mask . push_back ( word_mask ) ;
words_rect . push_back ( word_rect ) ;
}
for ( int w = 0 ; w < ( int ) words_mask . size ( ) ; w + + )
{
vector < vector < int > > observations ;
vector < vector < double > > confidences ;
vector < int > obs ;
// First find contours and sort by x coordinate of bbox
words_mask [ w ] . copyTo ( tmp ) ;
if ( tmp . empty ( ) )
continue ;
contours . clear ( ) ;
hierarchy . clear ( ) ;
/// Find contours
findContours ( tmp , contours , hierarchy , RETR_EXTERNAL , CHAIN_APPROX_SIMPLE , Point ( 0 , 0 ) ) ;
vector < Rect > contours_rect ;
for ( int i = 0 ; i < ( int ) contours . size ( ) ; i + + )
{
contours_rect . push_back ( boundingRect ( contours [ i ] ) ) ;
}
sort ( contours_rect . begin ( ) , contours_rect . end ( ) , sort_rect_horiz ) ;
// Do character recognition foreach contour
for ( int i = 0 ; i < ( int ) contours . size ( ) ; i + + )
{
vector < int > out_class ;
vector < double > out_conf ;
//take the center of the char rect and translate it to the real origin
Point char_center = Point ( contours_rect . at ( i ) . x + contours_rect . at ( i ) . width / 2 ,
contours_rect . at ( i ) . y + contours_rect . at ( i ) . height / 2 ) ;
char_center . x + = words_rect [ w ] . x ;
char_center . y + = words_rect [ w ] . y ;
int win_size = max ( contours_rect . at ( i ) . width , contours_rect . at ( i ) . height ) ;
win_size + = win_size * 0.6 ; // add some pixels in the border TODO: is this a parameter for the user space?
Rect char_rect = Rect ( char_center . x - win_size / 2 , char_center . y - win_size / 2 , win_size , win_size ) ;
char_rect & = Rect ( 0 , 0 , image . cols , image . rows ) ;
Mat tmp_image ;
image ( char_rect ) . copyTo ( tmp_image ) ;
classifier - > eval ( tmp_image , out_class , out_conf ) ;
if ( ! out_class . empty ( ) )
obs . push_back ( out_class [ 0 ] ) ;
//cout << " out class = " << vocabulary[out_class[0]] << "(" << out_conf[0] << ")" << endl;
observations . push_back ( out_class ) ;
confidences . push_back ( out_conf ) ;
}
@ -598,6 +871,278 @@ Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierNM(const std::string&
return makePtr < OCRHMMClassifierKNN > ( filename ) ;
}
// Character classifier callback for OCRHMMDecoder based on convolutional
// features: responses of a kernel bank over ZCA-whitened patches are pooled
// over a 3x3 spatial grid (9 pools, see eval) and classified with logistic
// regression. All model data is loaded from a FileStorage file.
class CV_EXPORTS OCRHMMClassifierCNN : public OCRHMMDecoder::ClassifierCallback
{
public:
    // Constructor: loads kernels, whitening parameters, logistic regression
    // weights and feature scaling ranges from the given file; errors out if
    // the file is not available.
    OCRHMMClassifierCNN(const std::string& filename);
    // Destructor
    ~OCRHMMClassifierCNN() {}

    // Classifies a single character image: fills out_class with candidate
    // class ids (predicted class first) and out_confidence with the matching
    // probability estimates.
    void eval(InputArray image, vector<int>& out_class, vector<double>& out_confidence);

protected:
    void normalizeAndZCA(Mat& patches);                        // contrast-normalize and ZCA-whiten patch rows
    double eval_feature(Mat& feature, double* prob_estimates); // logistic regression; returns winning class index

private:
    int nr_class;    // number of classes
    int nr_feature;  // number of features
    Mat feature_min; // scale range
    Mat feature_max;
    Mat weights;     // Logistic Regression weights
    Mat kernels;     // CNN kernels
    Mat M, P;        // ZCA Whitening parameters
    int window_size; // window size
    int quad_size;
    int patch_size;
    int num_quads;   // extract 25 quads (12x12) from each image
    int num_tiles;   // extract 25 patches (8x8) from each quad
    double alpha;    // used in non-linear activation function z = max(0, |D*a| - alpha)
};
// Loads the classifier data (convolution kernel bank, ZCA whitening
// parameters M and P, logistic regression weights, and the feature scaling
// ranges recorded during training) from a FileStorage file and initializes
// the fixed algorithm parameters.
// Raises Error::StsBadArg when the file cannot be opened.
OCRHMMClassifierCNN::OCRHMMClassifierCNN(const string& filename)
{
    FileStorage fs(filename, FileStorage::READ);
    // Checking isOpened() instead of probing file existence with ifstream
    // first avoids a time-of-check/time-of-use race and also reports files
    // that exist but cannot be opened for reading.
    if (!fs.isOpened())
        CV_Error(Error::StsBadArg, "Default classifier data file not found!");

    // Load kernels bank and whitening params
    fs["kernels"] >> kernels;
    fs["M"] >> M;
    fs["P"] >> P;
    // Load Logistic Regression weights
    fs["weights"] >> weights;
    // Load feature scaling ranges
    fs["feature_min"] >> feature_min;
    fs["feature_max"] >> feature_max;
    fs.release();

    // check all matrix dimensions match correctly and no one is empty
    CV_Assert((M.cols > 0) && (M.rows > 0));
    CV_Assert((P.cols > 0) && (P.rows > 0));
    CV_Assert((kernels.cols > 0) && (kernels.rows > 0));
    CV_Assert((weights.cols > 0) && (weights.rows > 0));
    CV_Assert((feature_min.cols > 0) && (feature_min.rows > 0));
    CV_Assert((feature_max.cols > 0) && (feature_max.rows > 0));

    nr_feature = weights.rows;
    nr_class   = weights.cols;
    // kernels are stored one flattened square patch per row, so the patch
    // side length is the square root of the row width (explicit cast: the
    // value is expected to be a perfect square)
    patch_size = (int)sqrt((double)kernels.cols);

    // algorithm internal parameters
    window_size = 32; // input images are resized to 32x32 before feature extraction
    num_quads   = 25;
    num_tiles   = 25;
    quad_size   = 12;
    alpha       = 0.5; // used in the non-linear activation z = max(0, |D*a| - alpha)
}
void OCRHMMClassifierCNN : : eval ( InputArray _src , vector < int > & out_class , vector < double > & out_confidence )
{
CV_Assert ( ( _src . getMat ( ) . type ( ) = = CV_8UC3 ) | | ( _src . getMat ( ) . type ( ) = = CV_8UC1 ) ) ;
out_class . clear ( ) ;
out_confidence . clear ( ) ;
Mat img = _src . getMat ( ) ;
if ( img . type ( ) = = CV_8UC3 )
{
cvtColor ( img , img , COLOR_RGB2GRAY ) ;
}
// shall we resize the input image or make a copy ?
resize ( img , img , Size ( window_size , window_size ) ) ;
Mat quad ;
Mat tmp ;
int patch_count = 0 ;
vector < vector < double > > data_pool ( 9 ) ;
int quad_id = 1 ;
for ( int q_x = 0 ; q_x < = window_size - quad_size ; q_x = q_x + ( quad_size / 2 - 1 ) )
{
for ( int q_y = 0 ; q_y < = window_size - quad_size ; q_y = q_y + ( quad_size / 2 - 1 ) )
{
Rect quad_rect = Rect ( q_x , q_y , quad_size , quad_size ) ;
quad = img ( quad_rect ) ;
//start sliding window (8x8) in each tile and store the patch as row in data_pool
for ( int w_x = 0 ; w_x < = quad_size - patch_size ; w_x + + )
{
for ( int w_y = 0 ; w_y < = quad_size - patch_size ; w_y + + )
{
quad ( Rect ( w_x , w_y , patch_size , patch_size ) ) . copyTo ( tmp ) ;
tmp = tmp . reshape ( 0 , 1 ) ;
tmp . convertTo ( tmp , CV_64F ) ;
normalizeAndZCA ( tmp ) ;
vector < double > patch ;
tmp . copyTo ( patch ) ;
if ( ( quad_id = = 1 ) | | ( quad_id = = 2 ) | | ( quad_id = = 6 ) | | ( quad_id = = 7 ) )
data_pool [ 0 ] . insert ( data_pool [ 0 ] . end ( ) , patch . begin ( ) , patch . end ( ) ) ;
if ( ( quad_id = = 2 ) | | ( quad_id = = 7 ) | | ( quad_id = = 3 ) | | ( quad_id = = 8 ) | | ( quad_id = = 4 ) | | ( quad_id = = 9 ) )
data_pool [ 1 ] . insert ( data_pool [ 1 ] . end ( ) , patch . begin ( ) , patch . end ( ) ) ;
if ( ( quad_id = = 4 ) | | ( quad_id = = 9 ) | | ( quad_id = = 5 ) | | ( quad_id = = 10 ) )
data_pool [ 2 ] . insert ( data_pool [ 2 ] . end ( ) , patch . begin ( ) , patch . end ( ) ) ;
if ( ( quad_id = = 6 ) | | ( quad_id = = 11 ) | | ( quad_id = = 16 ) | | ( quad_id = = 7 ) | | ( quad_id = = 12 ) | | ( quad_id = = 17 ) )
data_pool [ 3 ] . insert ( data_pool [ 3 ] . end ( ) , patch . begin ( ) , patch . end ( ) ) ;
if ( ( quad_id = = 7 ) | | ( quad_id = = 12 ) | | ( quad_id = = 17 ) | | ( quad_id = = 8 ) | | ( quad_id = = 13 ) | | ( quad_id = = 18 ) | | ( quad_id = = 9 ) | | ( quad_id = = 14 ) | | ( quad_id = = 19 ) )
data_pool [ 4 ] . insert ( data_pool [ 4 ] . end ( ) , patch . begin ( ) , patch . end ( ) ) ;
if ( ( quad_id = = 9 ) | | ( quad_id = = 14 ) | | ( quad_id = = 19 ) | | ( quad_id = = 10 ) | | ( quad_id = = 15 ) | | ( quad_id = = 20 ) )
data_pool [ 5 ] . insert ( data_pool [ 5 ] . end ( ) , patch . begin ( ) , patch . end ( ) ) ;
if ( ( quad_id = = 16 ) | | ( quad_id = = 21 ) | | ( quad_id = = 17 ) | | ( quad_id = = 22 ) )
data_pool [ 6 ] . insert ( data_pool [ 6 ] . end ( ) , patch . begin ( ) , patch . end ( ) ) ;
if ( ( quad_id = = 17 ) | | ( quad_id = = 22 ) | | ( quad_id = = 18 ) | | ( quad_id = = 23 ) | | ( quad_id = = 19 ) | | ( quad_id = = 24 ) )
data_pool [ 7 ] . insert ( data_pool [ 7 ] . end ( ) , patch . begin ( ) , patch . end ( ) ) ;
if ( ( quad_id = = 19 ) | | ( quad_id = = 24 ) | | ( quad_id = = 20 ) | | ( quad_id = = 25 ) )
data_pool [ 8 ] . insert ( data_pool [ 8 ] . end ( ) , patch . begin ( ) , patch . end ( ) ) ;
patch_count + + ;
}
}
quad_id + + ;
}
}
//do dot product of each normalized and whitened patch
//each pool is averaged and this yields a representation of 9xD
Mat feature = Mat : : zeros ( 9 , kernels . rows , CV_64FC1 ) ;
for ( int i = 0 ; i < 9 ; i + + )
{
Mat pool = Mat ( data_pool [ i ] ) ;
pool = pool . reshape ( 0 , ( int ) data_pool [ i ] . size ( ) / kernels . cols ) ;
for ( int p = 0 ; p < pool . rows ; p + + )
{
for ( int f = 0 ; f < kernels . rows ; f + + )
{
feature . row ( i ) . at < double > ( 0 , f ) = feature . row ( i ) . at < double > ( 0 , f ) + max ( 0.0 , std : : abs ( pool . row ( p ) . dot ( kernels . row ( f ) ) ) - alpha ) ;
}
}
}
feature = feature . reshape ( 0 , 1 ) ;
// data must be normalized within the range obtained during training
double lower = - 1.0 ;
double upper = 1.0 ;
for ( int k = 0 ; k < feature . cols ; k + + )
{
feature . at < double > ( 0 , k ) = lower + ( upper - lower ) *
( feature . at < double > ( 0 , k ) - feature_min . at < double > ( 0 , k ) ) /
( feature_max . at < double > ( 0 , k ) - feature_min . at < double > ( 0 , k ) ) ;
}
double * p = new double [ nr_class ] ;
double predict_label = eval_feature ( feature , p ) ;
//cout << " Prediction: " << vocabulary[predict_label] << " with probability " << p[0] << endl;
if ( predict_label < 0 )
CV_Error ( Error : : StsInternal , " OCRHMMClassifierCNN::eval Error: unexpected prediction in eval_feature() " ) ;
out_class . push_back ( ( int ) predict_label ) ;
out_confidence . push_back ( p [ ( int ) predict_label ] ) ;
for ( int i = 0 ; i < nr_class ; i + + )
{
if ( ( i ! = ( int ) predict_label ) & & ( p [ i ] ! = 0. ) )
{
out_class . push_back ( i ) ;
out_confidence . push_back ( p [ i ] ) ;
}
}
}
// normalize for contrast and apply ZCA whitening to a set of image patches.
// 'patches' holds one flattened patch per row (assumes CV_64F data — the
// callers convert before calling; TODO confirm for any external use).
// If the whitening parameters M (mean) and P (whitening matrix) are still
// empty (not loaded from the model file), they are estimated from these
// very patches on first use and cached in the members for later calls.
void OCRHMMClassifierCNN::normalizeAndZCA(Mat& patches)
{
    //Normalize for contrast
    for (int i = 0; i < patches.rows; i++)
    {
        Scalar row_mean, row_std;
        meanStdDev(patches.row(i), row_mean, row_std);
        // rescale to the unbiased (n-1) variance estimate and regularize
        // with +10 so near-constant patches are not blown up
        row_std[0] = sqrt(pow(row_std[0], 2) * patches.cols / (patches.cols - 1) + 10);
        patches.row(i) = (patches.row(i) - row_mean[0]) / row_std[0];
    }

    //ZCA whitening
    if ((M.dims == 0) || (P.dims == 0))
    {
        // parameters not available: estimate them from the given patches
        Mat CC;
        calcCovarMatrix(patches, CC, M, COVAR_NORMAL | COVAR_ROWS | COVAR_SCALE);
        CC = CC * patches.rows / (patches.rows - 1); // unbiased covariance

        Mat e_val, e_vec;
        eigen(CC.t(), e_val, e_vec);
        e_vec = e_vec.t();
        // D^(-1/2) with +0.1 regularization on the eigenvalues
        sqrt(1. / (e_val + 0.1), e_val);

        // build V (eigenvectors as columns) and diagonal D, taking the
        // columns/eigenvalues in reverse of the order returned by eigen()
        Mat V = Mat::zeros(e_vec.rows, e_vec.cols, CV_64FC1);
        Mat D = Mat::eye(e_vec.rows, e_vec.cols, CV_64FC1);
        for (int i = 0; i < e_vec.cols; i++)
        {
            e_vec.col(e_vec.cols - i - 1).copyTo(V.col(i));
            D.col(i) = D.col(i) * e_val.at<double>(0, e_val.rows - i - 1);
        }
        // ZCA whitening matrix: P = V * D^(-1/2) * V'
        P = V * D * V.t();
    }

    // center every patch and apply the whitening transform
    for (int i = 0; i < patches.rows; i++)
        patches.row(i) = patches.row(i) - M;
    patches = patches * P;
}
double OCRHMMClassifierCNN : : eval_feature ( Mat & feature , double * prob_estimates )
{
for ( int i = 0 ; i < nr_class ; i + + )
prob_estimates [ i ] = 0 ;
for ( int idx = 0 ; idx < nr_feature ; idx + + )
for ( int i = 0 ; i < nr_class ; i + + )
prob_estimates [ i ] + = weights . at < float > ( idx , i ) * feature . at < double > ( 0 , idx ) ; //TODO use vectorized dot product
int dec_max_idx = 0 ;
for ( int i = 1 ; i < nr_class ; i + + )
{
if ( prob_estimates [ i ] > prob_estimates [ dec_max_idx ] )
dec_max_idx = i ;
}
for ( int i = 0 ; i < nr_class ; i + + )
prob_estimates [ i ] = 1 / ( 1 + exp ( - prob_estimates [ i ] ) ) ;
double sum = 0 ;
for ( int i = 0 ; i < nr_class ; i + + )
sum + = prob_estimates [ i ] ;
for ( int i = 0 ; i < nr_class ; i + + )
prob_estimates [ i ] = prob_estimates [ i ] / sum ;
return dec_max_idx ;
}
// Factory for the CNN-based character classifier callback.
// 'filename' must point to the FileStorage (xml/yml) file holding the
// trained model data; the constructor errors out if it cannot be read.
Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierCNN(const std::string& filename)
{
    return makePtr<OCRHMMClassifierCNN>(filename);
}
/** @brief Utility function to create a tailored language model transitions table from a given list of words (lexicon).
@param vocabulary The language vocabulary (chars when ASCII English text).