diff --git a/modules/text/include/opencv2/text/ocr.hpp b/modules/text/include/opencv2/text/ocr.hpp index 9593a1415..bd1c18ffb 100644 --- a/modules/text/include/opencv2/text/ocr.hpp +++ b/modules/text/include/opencv2/text/ocr.hpp @@ -716,10 +716,6 @@ public: /** @brief produces a class confidence row-vector given an image */ CV_WRAP virtual void classify(InputArray image, OutputArray classProbabilities) = 0; - /** @brief produces a list of bounding box given an image - */ - - CV_WRAP virtual void detect(InputArray image, OutputArray classProbabilities) = 0; /** @brief produces a matrix containing class confidence row-vectors given an collection of images */ diff --git a/modules/text/include/opencv2/text/textDetector.hpp b/modules/text/include/opencv2/text/textDetector.hpp index ea1c7de9d..efbec6bff 100644 --- a/modules/text/include/opencv2/text/textDetector.hpp +++ b/modules/text/include/opencv2/text/textDetector.hpp @@ -65,19 +65,131 @@ namespace text //detection scenario class CV_EXPORTS_W BaseDetector { - public: +public: virtual ~BaseDetector() {}; virtual void run(Mat& image, - std::vector* component_rects=NULL, + std::vector* component_rects=NULL, std::vector* component_confidences=NULL, int component_level=0) = 0; virtual void run(Mat& image, Mat& mask, - std::vector* component_rects=NULL, + std::vector* component_rects=NULL, std::vector* component_confidences=NULL, int component_level=0) = 0; +}; +/** A virtual class for different models of text detection (including CNN based deep models) + */ + +class CV_EXPORTS_W TextRegionDetector +{ +protected: + /** Stores input and output size + */ + //netGeometry inputGeometry_; + //netGeometry outputGeometry_; + Size inputGeometry_; + Size outputGeometry_; + int inputChannelCount_; + int outputChannelCount_; + +public: + virtual ~TextRegionDetector() {} + + /** @brief produces a list of Bounding boxes and an estimate of text-ness confidence of Bounding Boxes + */ + CV_WRAP virtual void detect(InputArray image, OutputArray bboxProb ) = 0; + + + /** @brief simple getter method returning the size (height, width) of the input sample + */ + CV_WRAP virtual Size getInputGeometry(){return this->inputGeometry_;} + + /** @brief simple getter method returning the shape of the oputput + * Any text detector should output a number of text regions alongwith a score of text-ness + * From the shape it can be inferred the number of text regions and number of returned value + * for each region + */ + CV_WRAP virtual Size getOutputGeometry(){return this->outputGeometry_;} + + + +}; + +/** Generic structure of Deep CNN based Text Detectors + * */ +class CV_EXPORTS_W DeepCNNTextDetector : public TextRegionDetector +{ + /** @brief Class that uses a pretrained caffe model for text detection. + * Any text detection should + * This network is described in detail in: + * Minghui Liao et al.: TextBoxes: A Fast Text Detector with a Single Deep Neural Network + * https://arxiv.org/abs/1611.06779 + */ +protected: + /** all deep CNN based text detectors have a preprocessor (normally) + */ + Ptr preprocessor_; + /** @brief all image preprocessing is handled here including whitening etc. + * + * @param input the image to be preprocessed for the classifier. If the depth + * is CV_U8 values should be in [0,255] otherwise values are assumed to be in [0,1] + * + * @param output reference to the image to be fed to the classifier, the preprocessor will + * resize the image to the apropriate size and convert it to the apropriate depth\ + * + * The method preprocess should never be used externally, it is up to classify and classifyBatch + * methods to employ it. + */ + virtual void preprocess(const Mat& input,Mat& output); +public: + virtual ~DeepCNNTextDetector() {}; + + /** @brief Constructs a DeepCNNTextDetector object from a caffe pretrained model + * + * @param archFilename is the path to the prototxt file containing the deployment model architecture description. + * + * @param weightsFilename is the path to the pretrained weights of the model in binary fdorm. + * + * @param preprocessor is a pointer to the instance of a ImagePreprocessor implementing the preprocess_ protecteed method; + * + * @param minibatchSz the maximum number of samples that can processed in parallel. In practice this parameter + * has an effect only when computing in the GPU and should be set with respect to the memory available in the GPU. + * + * @param backEnd integer parameter selecting the coputation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is + * the only option + */ + CV_WRAP static Ptr create(String archFilename,String weightsFilename,Ptr preprocessor,int minibatchSz=100,int backEnd=OCR_HOLISTIC_BACKEND_CAFFE); + + /** @brief Constructs a DeepCNNTextDetector intended to be used for text area detection. + * + * This method loads a pretrained classifier and couples with a preprocessor that preprocess the image with mean subtraction of () + * The architecture and models weights can be downloaded from: + * https://github.com/sghoshcvc/TextBox-Models.git (size is around 100 MB) + + * @param archFilename is the path to the prototxt file containing the deployment model architecture description. + * When employing OCR_HOLISTIC_BACKEND_CAFFE this is the path to the deploy ".prototxt". + * + * @param weightsFilename is the path to the pretrained weights of the model. When employing + * OCR_HOLISTIC_BACKEND_CAFFE this is the path to the ".caffemodel" file. + * + * @param backEnd integer parameter selecting the coputation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is + * the only option + */ + CV_WRAP static Ptr createTextBoxNet(String archFilename,String weightsFilename,int backEnd=OCR_HOLISTIC_BACKEND_CAFFE); + friend class ImagePreprocessor; + +}; + +/** @brief textDetector class provides the functionallity of text bounding box detection. + * A TextRegionDetector is employed to find bounding boxes of text + * words given an input image. + * + * This class implements the logic of providing text bounding boxes in a vector of rects given an TextRegionDetector + * The TextRegionDetector can be any text detector + * + */ class CV_EXPORTS_W textDetector : public BaseDetector { @@ -125,9 +237,9 @@ public: - /** @brief simple getter for the preprocessing functor + /** @brief simple getter for the preprocessing functor */ - CV_WRAP virtual Ptr getClassifier()=0; + CV_WRAP virtual Ptr getClassifier()=0; /** @brief Creates an instance of the textDetector class. @@ -135,7 +247,7 @@ public: */ - CV_WRAP static Ptr create(Ptr classifierPtr); + CV_WRAP static Ptr create(Ptr classifierPtr); /** @brief Creates an instance of the textDetector class and implicitly also a DeepCNN classifier. diff --git a/modules/text/src/ocr_holistic.cpp b/modules/text/src/ocr_holistic.cpp index 9791e62bb..ae73b04dc 100644 --- a/modules/text/src/ocr_holistic.cpp +++ b/modules/text/src/ocr_holistic.cpp @@ -459,53 +459,53 @@ protected: #endif } - void process_(Mat inputImage, Mat &outputMat) - { - // do forward pass and stores the output in outputMat - //Process one image - CV_Assert(this->minibatchSz_==1); - //CV_Assert(outputMat.isContinuous()); +// void process_(Mat inputImage, Mat &outputMat) +// { +// // do forward pass and stores the output in outputMat +// //Process one image +// CV_Assert(this->minibatchSz_==1); +// //CV_Assert(outputMat.isContinuous()); -#ifdef HAVE_CAFFE - net_->input_blobs()[0]->Reshape(1, this->channelCount_,this->inputGeometry_.height,this->inputGeometry_.width); - net_->Reshape(); - float* inputBuffer=net_->input_blobs()[0]->mutable_cpu_data(); - float* inputData=inputBuffer; +//#ifdef HAVE_CAFFE +// net_->input_blobs()[0]->Reshape(1, this->channelCount_,this->inputGeometry_.height,this->inputGeometry_.width); +// net_->Reshape(); +// float* inputBuffer=net_->input_blobs()[0]->mutable_cpu_data(); +// float* inputData=inputBuffer; - std::vector input_channels; - Mat preprocessed; - // if the image have multiple color channels the input layer should be populated accordingly - for (int channel=0;channel < this->channelCount_;channel++){ +// std::vector input_channels; +// Mat preprocessed; +// // if the image have multiple color channels the input layer should be populated accordingly +// for (int channel=0;channel < this->channelCount_;channel++){ - cv::Mat netInputWraped(this->inputGeometry_.height, this->inputGeometry_.width, CV_32FC1, inputData); - input_channels.push_back(netInputWraped); - //input_data += width * height; - inputData+=(this->inputGeometry_.height*this->inputGeometry_.width); - } - this->preprocess(inputImage,preprocessed); - split(preprocessed, input_channels); +// cv::Mat netInputWraped(this->inputGeometry_.height, this->inputGeometry_.width, CV_32FC1, inputData); +// input_channels.push_back(netInputWraped); +// //input_data += width * height; +// inputData+=(this->inputGeometry_.height*this->inputGeometry_.width); +// } +// this->preprocess(inputImage,preprocessed); +// split(preprocessed, input_channels); - //preprocessed.copyTo(netInputWraped); +// //preprocessed.copyTo(netInputWraped); - this->net_->Forward(); - const float* outputNetData=net_->output_blobs()[0]->cpu_data(); - // const float* outputNetData1=net_->output_blobs()[1]->cpu_data(); +// this->net_->Forward(); +// const float* outputNetData=net_->output_blobs()[0]->cpu_data(); +// // const float* outputNetData1=net_->output_blobs()[1]->cpu_data(); - this->outputGeometry_ = Size(net_->output_blobs()[0]->width(),net_->output_blobs()[0]->height()); - int outputSz = this->outputSize_ * this->outputGeometry_.height * this->outputGeometry_.width; - outputMat.create(this->outputGeometry_.height , this->outputGeometry_.width,CV_32FC1); - float*outputMatData=(float*)(outputMat.data); +// this->outputGeometry_ = Size(net_->output_blobs()[0]->width(),net_->output_blobs()[0]->height()); +// int outputSz = this->outputSize_ * this->outputGeometry_.height * this->outputGeometry_.width; +// outputMat.create(this->outputGeometry_.height , this->outputGeometry_.width,CV_32FC1); +// float*outputMatData=(float*)(outputMat.data); - memcpy(outputMatData,outputNetData,sizeof(float)*outputSz); +// memcpy(outputMatData,outputNetData,sizeof(float)*outputSz); -#endif - } +//#endif +// } @@ -587,15 +587,15 @@ public: inputImageList.push_back(image.getMat()); classifyBatch(inputImageList,classProbabilities); } - void detect(InputArray image, OutputArray Bbox_prob) - { +// void detect(InputArray image, OutputArray Bbox_prob) +// { - Bbox_prob.create(this->outputGeometry_,CV_32F); // dummy initialization is it needed - Mat outputMat = Bbox_prob.getMat(); - process_(image.getMat(),outputMat); - //copy back to outputArray - outputMat.copyTo(Bbox_prob); - } +// Bbox_prob.create(this->outputGeometry_,CV_32F); // dummy initialization is it needed +// Mat outputMat = Bbox_prob.getMat(); +// process_(image.getMat(),outputMat); +// //copy back to outputArray +// outputMat.copyTo(Bbox_prob); +// } void classifyBatch(InputArrayOfArrays inputImageList, OutputArray classProbabilities) { diff --git a/modules/text/src/text_detector.cpp b/modules/text/src/text_detector.cpp index 5b18e9708..1b979c253 100644 --- a/modules/text/src/text_detector.cpp +++ b/modules/text/src/text_detector.cpp @@ -23,6 +23,8 @@ namespace cv { namespace text { + + class textDetectImpl: public textDetector{ private: struct NetOutput{ @@ -60,9 +62,9 @@ private: }; protected: - Ptr classifier_; + Ptr classifier_; public: - textDetectImpl(Ptr classifierPtr):classifier_(classifierPtr) + textDetectImpl(Ptr classifierPtr):classifier_(classifierPtr) { } @@ -131,13 +133,13 @@ public: - Ptr getClassifier() + Ptr getClassifier() { return this->classifier_; } }; -Ptr textDetector::create(Ptr classifierPtr) +Ptr textDetector::create(Ptr classifierPtr) { return Ptr(new textDetectImpl(classifierPtr)); } @@ -155,7 +157,7 @@ Ptr textDetector::create(String modelArchFilename, String modelWei textbox_mean.at(0,2)=123; preprocessor->set_mean(textbox_mean); // create a pointer to text box detector(textDetector) - Ptr classifierPtr(DeepCNN::create(modelArchFilename,modelWeightsFilename,preprocessor,1)); + Ptr classifierPtr(DeepCNNTextDetector::create(modelArchFilename,modelWeightsFilename,preprocessor,1)); return Ptr(new textDetectImpl(classifierPtr)); } diff --git a/modules/text/src/text_detectorCNN.cpp b/modules/text/src/text_detectorCNN.cpp new file mode 100644 index 000000000..b48e97e7c --- /dev/null +++ b/modules/text/src/text_detectorCNN.cpp @@ -0,0 +1,343 @@ +#include "precomp.hpp" +#include "opencv2/imgproc.hpp" +#include "opencv2/core.hpp" + + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#ifdef HAVE_CAFFE +#include "caffe/caffe.hpp" +#endif +namespace cv { namespace text { + +inline bool fileExists (String filename) { + std::ifstream f(filename.c_str()); + return f.good(); +} + +//************************************************************************************ +//****************** TextImageClassifier ***************************************** +//************************************************************************************ + +//void TextImageClassifier::preprocess(const Mat& input,Mat& output) +//{ +// this->preprocessor_->preprocess_(input,output,this->inputGeometry_,this->channelCount_); +//} + +//void TextImageClassifier::setPreprocessor(Ptr ptr) +//{ +// CV_Assert(!ptr.empty()); +// preprocessor_=ptr; +//} + +//Ptr TextImageClassifier::getPreprocessor() +//{ +// return preprocessor_; +//} + + +class DeepCNNTextDetectorCaffeImpl: public DeepCNNTextDetector{ +protected: + + + void process_(Mat inputImage, Mat &outputMat) + { + // do forward pass and stores the output in outputMat + //Process one image + // CV_Assert(this->outputGeometry_.batchSize==1); + //CV_Assert(outputMat.isContinuous()); + +#ifdef HAVE_CAFFE + net_->input_blobs()[0]->Reshape(1, this->inputChannelCount_,this->inputGeometry_.height,this->inputGeometry_.width); + net_->Reshape(); + float* inputBuffer=net_->input_blobs()[0]->mutable_cpu_data(); + float* inputData=inputBuffer; + + std::vector input_channels; + Mat preprocessed; + // if the image have multiple color channels the input layer should be populated accordingly + for (int channel=0;channel < this->inputChannelCount_;channel++){ + + cv::Mat netInputWraped(this->inputGeometry_.height, this->inputGeometry_.width, CV_32FC1, inputData); + input_channels.push_back(netInputWraped); + //input_data += width * height; + inputData+=(this->inputGeometry_.height*this->inputGeometry_.width); + } + this->preprocess(inputImage,preprocessed); + split(preprocessed, input_channels); + + //preprocessed.copyTo(netInputWraped); + + + this->net_->Forward(); + const float* outputNetData=net_->output_blobs()[0]->cpu_data(); + // const float* outputNetData1=net_->output_blobs()[1]->cpu_data(); + + + + + this->outputGeometry_.height = net_->output_blobs()[0]->height(); + this->outputGeometry_.width = net_->output_blobs()[0]->width(); + this->outputChannelCount_ = net_->output_blobs()[0]->channels(); + int outputSz = this->outputChannelCount_ * this->outputGeometry_.height * this->outputGeometry_.width; + outputMat.create(this->outputGeometry_.height , this->outputGeometry_.width,CV_32FC1); + float*outputMatData=(float*)(outputMat.data); + + memcpy(outputMatData,outputNetData,sizeof(float)*outputSz); + + + +#endif + } + + + +#ifdef HAVE_CAFFE + Ptr > net_; +#endif + //Size inputGeometry_; + int minibatchSz_;//The existence of the assignment operator mandates this to be nonconst + //int outputSize_; +public: + DeepCNNTextDetectorCaffeImpl(const DeepCNNTextDetectorCaffeImpl& dn): + minibatchSz_(dn.minibatchSz_){ + outputGeometry_=dn.outputGeometry_; + inputGeometry_=dn.inputGeometry_; + //Implemented to supress Visual Studio warning "assignment operator could not be generated" +#ifdef HAVE_CAFFE + this->net_=dn.net_; +#endif + } + DeepCNNTextDetectorCaffeImpl& operator=(const DeepCNNTextDetectorCaffeImpl &dn) + { +#ifdef HAVE_CAFFE + this->net_=dn.net_; +#endif + this->setPreprocessor(dn.preprocessor_); + this->inputGeometry_=dn.inputGeometry_; + this->inputChannelCount_=dn.inputChannelCount_; + this->outputChannelCount_ = dn.outputChannelCount_; + // this->minibatchSz_=dn.minibatchSz_; + //this->outputGeometry_=dn.outputSize_; + this->preprocessor_=dn.preprocessor_; + this->outputGeometry_=dn.outputGeometry_; + return *this; + //Implemented to supress Visual Studio warning "assignment operator could not be generated" + } + + DeepCNNTextDetectorCaffeImpl(String modelArchFilename, String modelWeightsFilename,Ptr preprocessor, int maxMinibatchSz) + :minibatchSz_(maxMinibatchSz) + { + + CV_Assert(this->minibatchSz_>0); + CV_Assert(fileExists(modelArchFilename)); + CV_Assert(fileExists(modelWeightsFilename)); + CV_Assert(!preprocessor.empty()); + this->setPreprocessor(preprocessor); +#ifdef HAVE_CAFFE + this->net_.reset(new caffe::Net(modelArchFilename, caffe::TEST)); + CV_Assert(net_->num_inputs()==1); + CV_Assert(net_->num_outputs()==1); + CV_Assert(this->net_->input_blobs()[0]->channels()==1 + ||this->net_->input_blobs()[0]->channels()==3); + // this->channelCount_=this->net_->input_blobs()[0]->channels(); + + + + this->net_->CopyTrainedLayersFrom(modelWeightsFilename); + + caffe::Blob* inputLayer = this->net_->input_blobs()[0]; + + this->inputGeometry_.height = inputLayer->height(); + this->inputGeometry_.width = inputLayer->width(); + this->inputChannelCount_ = inputLayer->channels(); + //this->inputGeometry_.batchSize =1; + + inputLayer->Reshape(this->minibatchSz_,this->inputChannelCount_,this->inputGeometry_.height, this->inputGeometry_.width); + net_->Reshape(); + this->outputChannelCount_ = net_->output_blobs()[0]->channels(); + //this->outputGeometry_.batchSize =1; + this->outputGeometry_.height =net_->output_blobs()[0]->height(); + this->outputGeometry_.width = net_->output_blobs()[0]->width(); + + + + + +#else + CV_Error(Error::StsError,"Caffe not available during compilation!"); +#endif + } + + + void detect(InputArray image, OutputArray Bbox_prob) + { + Size outSize = Size(this->outputGeometry_.height,outputGeometry_.width); + Bbox_prob.create(outSize,CV_32F); // dummy initialization is it needed + Mat outputMat = Bbox_prob.getMat(); + process_(image.getMat(),outputMat); + //copy back to outputArray + outputMat.copyTo(Bbox_prob); + } + + + + //int getOutputSize() + //{ + // return this->outputSize_; + //} + Size getOutputGeometry() + { + return this->outputGeometry_; + } + Size getinputGeometry() + { + return this->inputGeometry_; + } + + int getMinibatchSize() + { + return this->minibatchSz_; + } + + int getBackend() + { + return OCR_HOLISTIC_BACKEND_CAFFE; + } + void setPreprocessor(Ptr ptr) + { + CV_Assert(!ptr.empty()); + preprocessor_=ptr; + } + + Ptr getPreprocessor() + { + return preprocessor_; + } +}; + + +Ptr DeepCNNTextDetector::create(String archFilename,String weightsFilename,Ptr preprocessor,int minibatchSz,int backEnd) +{ + if(preprocessor.empty()) + { + // create a custom preprocessor with rawval + Ptr preprocessor=ImagePreprocessor::createImageCustomPreprocessor(255); + // set the mean for the preprocessor + + Mat textbox_mean(1,3,CV_8U); + textbox_mean.at(0,0)=104; + textbox_mean.at(0,1)=117; + textbox_mean.at(0,2)=123; + preprocessor->set_mean(textbox_mean); + } + switch(backEnd){ + case OCR_HOLISTIC_BACKEND_CAFFE: + + return Ptr(new DeepCNNTextDetectorCaffeImpl(archFilename, weightsFilename,preprocessor, minibatchSz)); + break; + case OCR_HOLISTIC_BACKEND_NONE: + default: + CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); + return Ptr(); + break; + } + return Ptr(); + +} + + +Ptr DeepCNNTextDetector::createTextBoxNet(String archFilename,String weightsFilename,int backEnd) +{ + + // create a custom preprocessor with rawval + Ptr preprocessor=ImagePreprocessor::createImageCustomPreprocessor(255); + // set the mean for the preprocessor + + Mat textbox_mean(1,3,CV_8U); + textbox_mean.at(0,0)=104; + textbox_mean.at(0,1)=117; + textbox_mean.at(0,2)=123; + preprocessor->set_mean(textbox_mean); + switch(backEnd){ + case OCR_HOLISTIC_BACKEND_CAFFE: + return Ptr(new DeepCNNTextDetectorCaffeImpl(archFilename, weightsFilename,preprocessor, 100)); + break; + case OCR_HOLISTIC_BACKEND_NONE: + default: + CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); + return Ptr(); + break; + } + return Ptr(); + +} + +void DeepCNNTextDetector::preprocess(const Mat& input,Mat& output) +{ + Size inputHtWd = Size(this->inputGeometry_.height,this->inputGeometry_.width); + this->preprocessor_->preprocess(input,output,inputHtWd,this->inputChannelCount_); +} + +//namespace cnn_config{ +//namespace caffe_backend{ + +//#ifdef HAVE_CAFFE + +//bool getCaffeGpuMode() +//{ +// return caffe::Caffe::mode()==caffe::Caffe::GPU; +//} + +//void setCaffeGpuMode(bool useGpu) +//{ +// if(useGpu) +// { +// caffe::Caffe::set_mode(caffe::Caffe::GPU); +// }else +// { +// caffe::Caffe::set_mode(caffe::Caffe::CPU); +// } +//} + +//bool getCaffeAvailable() +//{ +// return true; +//} + +//#else + +//bool getCaffeGpuMode() +//{ +// CV_Error(Error::StsError,"Caffe not available during compilation!"); +// return 0; +//} + +//void setCaffeGpuMode(bool useGpu) +//{ +// CV_Error(Error::StsError,"Caffe not available during compilation!"); +// CV_Assert(useGpu==1);//Compilation directives force +//} + +//bool getCaffeAvailable(){ +// return 0; +//} + +//#endif + +//}//namespace caffe +//}//namespace cnn_config + +} } //namespace text namespace cv +