From 3aa88889aad6b8211c5d40c4fbcfa1e1c36b1459 Mon Sep 17 00:00:00 2001 From: Maksim Shabunin Date: Thu, 28 Sep 2017 19:34:15 +0300 Subject: [PATCH] Reworked HolisticWordspotter to work with dnn module --- modules/text/CMakeLists.txt | 61 +--- modules/text/FindCaffe.cmake | 14 - modules/text/FindGlog.cmake | 10 - modules/text/FindProtobuf.cmake | 10 - modules/text/include/opencv2/text/ocr.hpp | 177 ++---------- modules/text/samples/dictnet_demo.cpp | 89 ++---- modules/text/samples/dictnet_demo.py | 82 ------ modules/text/src/ocr_holistic.cpp | 330 +++++----------------- modules/text/text_config.hpp.in | 6 - 9 files changed, 122 insertions(+), 657 deletions(-) delete mode 100644 modules/text/FindCaffe.cmake delete mode 100755 modules/text/FindGlog.cmake delete mode 100755 modules/text/FindProtobuf.cmake delete mode 100644 modules/text/samples/dictnet_demo.py diff --git a/modules/text/CMakeLists.txt b/modules/text/CMakeLists.txt index 31e23673b..5d0f89f0d 100644 --- a/modules/text/CMakeLists.txt +++ b/modules/text/CMakeLists.txt @@ -1,5 +1,5 @@ set(the_description "Text Detection and Recognition") -ocv_define_module(text opencv_ml opencv_imgproc opencv_core opencv_features2d OPTIONAL opencv_highgui WRAP python java) +ocv_define_module(text opencv_ml opencv_imgproc opencv_core opencv_features2d opencv_dnn OPTIONAL opencv_highgui WRAP python java) if(NOT CMAKE_CROSSCOMPILING OR OPENCV_FIND_TESSERACT) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake) @@ -22,62 +22,3 @@ ocv_include_directories(${CMAKE_CURRENT_BINARY_DIR}) ocv_add_testdata(samples/ contrib/text FILES_MATCHING PATTERN "*.xml" PATTERN "*.xml.gz" REGEX "scenetext[0-9]+.jpg" ) - - -#Principal source from which adaptation came is the cnn_3dobj module -find_package(Caffe) - -if(Caffe_FOUND) - message(STATUS "Caffe: YES") - set(HAVE_CAFFE 1) -else() - message(STATUS "Caffe: NO") -endif() - -find_package(Protobuf) -if(Protobuf_FOUND) - message(STATUS "Protobuf: YES") - set(HAVE_PROTOBUF 1) -else() - message(STATUS "Protobuf: NO") -endif() - -find_package(Glog) -if(Glog_FOUND) - message(STATUS "Glog: YES") - set(HAVE_GLOG 1) -else() - message(STATUS "Glog: NO") -endif() - -if(HAVE_CAFFE) -message(STATUS "HAVE CAFFE!!!") -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/text_config.hpp.in - ${CMAKE_CURRENT_SOURCE_DIR}/include/opencv2/text_config.hpp @ONLY) - - -include_directories(${CMAKE_CURRENT_BINARY_DIR}) - -if(${Caffe_FOUND}) - - include_directories(${Caffe_INCLUDE_DIR}) - #taken from caffe's cmake - find_package(HDF5 COMPONENTS HL REQUIRED) - include_directories(SYSTEM ${HDF5_INCLUDE_DIRS} ${HDF5_HL_INCLUDE_DIR}) - list(APPEND Caffe_LINKER_LIBS ${HDF5_LIBRARIES}) - find_package(Boost 1.46 REQUIRED COMPONENTS system thread filesystem) - include_directories(SYSTEM ${Boost_INCLUDE_DIR}) - include_directories(SYSTEM /usr/local/cuda-7.5/targets/x86_64-linux/include/) - list(APPEND Caffe_LINKER_LIBS ${Boost_LIBRARIES}) - -endif() - - -if(${Caffe_FOUND}) - #taken from caffe's cmake - target_link_libraries(opencv_text ${Caffe_LIBS} ${Glog_LIBS} ${Protobuf_LIBS} ${HDF5_LIBRARIES} ${Boost_LIBRARIES}) -endif() -endif() - -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/text_config.hpp.in - ${CMAKE_BINARY_DIR}/text_config.hpp @ONLY) \ No newline at end of file diff --git a/modules/text/FindCaffe.cmake b/modules/text/FindCaffe.cmake deleted file mode 100644 index 12948f629..000000000 --- a/modules/text/FindCaffe.cmake +++ /dev/null @@ -1,14 +0,0 @@ -# Caffe package for CNN Triplet training -unset(Caffe_FOUND) - -find_path(Caffe_INCLUDE_DIR NAMES caffe/caffe.hpp caffe/common.hpp caffe/net.hpp caffe/proto/caffe.pb.h caffe/util/io.hpp caffe/vision_layers.hpp - HINTS - /usr/local/include) - -find_library(Caffe_LIBS NAMES caffe - HINTS - /usr/local/lib) - -if(Caffe_LIBS AND Caffe_INCLUDE_DIR) - set(Caffe_FOUND 1) -endif() diff --git a/modules/text/FindGlog.cmake b/modules/text/FindGlog.cmake deleted file mode 100755 index c30e9f4a6..000000000 --- a/modules/text/FindGlog.cmake +++ /dev/null @@ -1,10 +0,0 @@ -#Required for Caffe -unset(Glog_FOUND) - -find_library(Glog_LIBS NAMES glog - HINTS - /usr/local/lib) - -if(Glog_LIBS) - set(Glog_FOUND 1) -endif() diff --git a/modules/text/FindProtobuf.cmake b/modules/text/FindProtobuf.cmake deleted file mode 100755 index 6d0ad56a1..000000000 --- a/modules/text/FindProtobuf.cmake +++ /dev/null @@ -1,10 +0,0 @@ -#Protobuf package required for Caffe -unset(Protobuf_FOUND) - -find_library(Protobuf_LIBS NAMES protobuf - HINTS - /usr/local/lib) - -if(Protobuf_LIBS) - set(Protobuf_FOUND 1) -endif() diff --git a/modules/text/include/opencv2/text/ocr.hpp b/modules/text/include/opencv2/text/ocr.hpp index 45c77f418..645afeaef 100644 --- a/modules/text/include/opencv2/text/ocr.hpp +++ b/modules/text/include/opencv2/text/ocr.hpp @@ -46,10 +46,6 @@ #include #include -#include -#include - - namespace cv { @@ -540,107 +536,24 @@ at each window location. CV_EXPORTS_W Ptr loadOCRBeamSearchClassifierCNN(const String& filename); -//! @} - - - -//Classifiers should provide diferent backends -//For the moment only caffe is implemeted -enum{ - OCR_HOLISTIC_BACKEND_NONE, - OCR_HOLISTIC_BACKEND_CAFFE -}; - - -/** @brief Abstract class that implements the classifcation of text images. - * - * The interface is generic enough to describe any image classifier. And allows - * to take advantage of compouting in batches. While word classifiers are the default - * networks, any image classifers should work. - * - */ -class CV_EXPORTS_W TextImageClassifier -{ -protected: - Size inputSz_; - int channelCount_; - /** @brief all image preprocessing is handled here including whitening etc. - * - * @param input the image to be preprocessed for the classifier. If the depth - * is CV_U8 values should be in [0,255] otherwise values are assumed to be in [0,1] - * - * @param output reference to the image to be fed to the classifier, the preprocessor will - * resize the image to the apropriate size and convert it to the apropriate depth\ - * - * The method preprocess should never be used externally, it is up to classify and classifyBatch - * methods to employ it. - */ - virtual void preprocess(Mat& input,Mat& output)=0; -public: - virtual ~TextImageClassifier() {} - /** @brief produces a class confidence row-vector given an image - */ - CV_WRAP virtual void classify(InputArray image, OutputArray classProbabilities) = 0; - /** @brief produces a matrix containing class confidence row-vectors given an collection of images - */ - CV_WRAP virtual void classifyBatch(InputArrayOfArrays image, OutputArray classProbabilities) = 0; - /** @brief simple getter method returning the size of the oputput row-vector - */ - CV_WRAP virtual int getOutputSize()=0; - /** @brief simple getter method returning the size of the minibatches for this classifier. - * If not applicabe this method should return 1 - */ - CV_WRAP virtual int getMinibatchSize()=0; - /** @brief simple getter method returning a value describing the framework beeing employed to implement the classifier - */ - CV_WRAP virtual int getBackend(){return OCR_HOLISTIC_BACKEND_NONE;} -}; - -class CV_EXPORTS_W DictNet:public TextImageClassifier -{ - /** @brief Class that uses a pretrained caffe model for word classification. - * - * This network is described in detail in: - * Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015 - * http://arxiv.org/abs/1412.1842 - */ -public: - virtual ~DictNet() {}; - - CV_WRAP virtual bool usingGpu()=0; - /** @brief Constructs a DictNet object from a caffe pretrained model - * - * @param archFilename is the path to the prototxt file containing the deployment model architecture description. - * - * @param weightsFilename is the path to the pretrained weights of the model in binary fdorm. This file can be - * very large, up to 2GB. - * - * @param minibatchSz the maximum number of samples that can processed in parallel. In practice this parameter - * has an effect only when computing in the GPU and should be set with respect to the memory available in the GPU. - * - * @param useGpu boolean flag setting GPU or CPU computation - * - * @param backEnd integer parameter selecting the coputation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is - * the only option - */ - CV_WRAP static Ptr create(String archFilename,String weightsFilename,int minibatchSz=100,bool useGpu=0,int backEnd=OCR_HOLISTIC_BACKEND_CAFFE); -}; - - /** @brief OCRHolisticWordRecognizer class provides the functionallity of segmented wordspotting. - * Given a predefined vocabulary , a TextImageClassifier is employed to select the most probable + * Given a predefined vocabulary , a DictNet is employed to select the most probable * word given an input image. * - * This class implements the logic of providing transcriptions given a vocabulary and and an image - * classifer. + * DictNet is described in detail in: + * Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015 + * http://arxiv.org/abs/1412.1842 */ -class CV_EXPORTS_W OCRHolisticWordRecognizer : public BaseOCR +class CV_EXPORTS OCRHolisticWordRecognizer : public BaseOCR { public: - virtual void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, - int component_level=OCR_LEVEL_WORD)=0; + virtual void run(Mat& image, + std::string& output_text, + std::vector* component_rects = NULL, + std::vector* component_texts = NULL, + std::vector* component_confidences = NULL, + int component_level = OCR_LEVEL_WORD) = 0; /** @brief Recognize text using a segmentation based word-spotting/classifier cnn. @@ -665,68 +578,24 @@ public: @param component_level must be OCR_LEVEL_WORD. */ - - virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, - int component_level=OCR_LEVEL_WORD)=0; - - - /** - @brief Method that provides a quick and simple interface to a single word image classifcation - - @param inputImage an image expected to be a CV_U8C1 or CV_U8C3 of any size assumed to contain a single word - - @param transcription an opencv string that will store the detected word transcription - - @param confidence a double that will be updated with the confidence the classifier has for the selected word - */ - CV_WRAP virtual void recogniseImage(InputArray inputImage,CV_OUT String& transcription,CV_OUT double& confidence)=0; - - /** - @brief Method that provides a quick and simple interface to a multiple word image classifcation taking advantage - the classifiers parallel capabilities. - - @param inputImageList an list of images expected to be a CV_U8C1 or CV_U8C3 each image can be of any size and is assumed - to contain a single word. - - @param transcriptions a vector of opencv strings that will store the detected word transcriptions, one for each - input image - - @param confidences a vector of double that will be updated with the confidence the classifier has for each of the - selected words. - */ - CV_WRAP virtual void recogniseImageBatch(InputArrayOfArrays inputImageList,CV_OUT std::vector& transcriptions,CV_OUT std::vector& confidences)=0; - - - /** - @brief simple getted for the vocabulary employed - */ - CV_WRAP virtual const std::vector& getVocabulary()=0; - + virtual void run(Mat& image, + Mat& mask, + std::string& output_text, + std::vector* component_rects = NULL, + std::vector* component_texts = NULL, + std::vector* component_confidences = NULL, + int component_level = OCR_LEVEL_WORD) = 0; /** @brief Creates an instance of the OCRHolisticWordRecognizer class. - - @param classifierPtr an instance of TextImageClassifier, normaly a DictNet instance - @param vocabullaryFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line - in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize - of the classifier. */ - CV_WRAP static Ptr create(Ptr classifierPtr,String vocabullaryFilename); - /** @brief Creates an instance of the OCRHolisticWordRecognizer class and implicitly also a DictNet classifier. - - @param modelArchFilename the relative or absolute path to the prototxt file describing the classifiers architecture. - @param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form. - @param vocabullaryFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line - in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize - of the classifier. - */ - CV_WRAP static Ptr create(String modelArchFilename, String modelWeightsFilename, String vocabullaryFilename); - + static Ptr create(const std::string &archFilename, + const std::string &weightsFilename, + const std::string &wordsFilename); }; +//! @} -} -} +}} // cv::text:: #endif // _OPENCV_TEXT_OCR_HPP_ diff --git a/modules/text/samples/dictnet_demo.cpp b/modules/text/samples/dictnet_demo.cpp index 0dab59ced..277a1c9be 100644 --- a/modules/text/samples/dictnet_demo.cpp +++ b/modules/text/samples/dictnet_demo.cpp @@ -12,79 +12,50 @@ #include "opencv2/imgproc.hpp" #include -#include #include -#include -#include -inline std::string getHelpStr(std::string progFname){ - std::stringstream out; - out << " Demo of wordspotting CNN for text recognition." << std::endl; - out << " Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015"< ... " << std::endl; - out << " Caffe Model files (dictnet_vgg.caffemodel, dictnet_vgg_deploy.prototxt, dictnet_vgg_labels.txt)"<" << endl; + cout << " Caffe Model files (dictnet_vgg.caffemodel, dictnet_vgg_deploy.prototxt, dictnet_vgg_labels.txt)"< imageList; - for(int imageIdx=2;imageIdx cnn=cv::text::DictNet::create( - "dictnet_vgg_deploy.prototxt","dictnet_vgg.caffemodel",100,USE_GPU); - cv::Ptr wordSpotter= - cv::text::OCRHolisticWordRecognizer::create(cnn,"dictnet_vgg_labels.txt"); + Ptr wordSpotter = OCRHolisticWordRecognizer::create("dictnet_vgg_deploy.prototxt", "dictnet_vgg.caffemodel", "dictnet_vgg_labels.txt"); - std::vector wordList; - std::vector outProbabillities; - wordSpotter->recogniseImageBatch(imageList,wordList,outProbabillities); + std::string word; + vector confs; + wordSpotter->run(image, word, 0, 0, &confs); - std::ofstream out; - out.open(argv[1]); - for(int imgIdx=0;imgIdx #include -#include -#include -#include -#include -#include -#include -#include -#include - -//should this be moved elsewhere? -//In precomp.hpp It doesn't work -#ifdef HAVE_CAFFE -#include "caffe/caffe.hpp" -#endif +using namespace std; namespace cv { namespace text { -//Maybe OpenCV has a routine better suited -inline bool fileExists (String filename) { - std::ifstream f(filename.c_str()); - return f.good(); -} - - -class DictNetCaffeImpl: public DictNet{ -protected: - void preprocess(Mat& input,Mat& output){ - if(input.channels()==3){ - Mat tmpInput; - cvtColor(input,tmpInput,COLOR_BGR2GRAY); - if(input.depth()==CV_8U){ - tmpInput.convertTo(output,CV_32FC1,1/255.0); - }else{//Assuming values are at the desired [0,1] range - tmpInput.convertTo(output, CV_32FC1); - } - }else{ - if(input.channels()==1){ - if(input.depth()==CV_8U){ - input.convertTo(output, CV_32FC1,1/255.0); - }else{//Assuming values are at the desired [0,1] range - input.convertTo(output, CV_32FC1); - } - }else{ - CV_Error(Error::StsError,"Expecting images with either 1 or 3 channels"); - } - } - resize(output,output,this->inputGeometry_); - Scalar dev,mean; - meanStdDev(output,mean,dev); - subtract(output,mean[0],output); - divide(output,(dev[0]/128.0),output); - } - - void classifyMiniBatch(std::vector inputImageList, Mat outputMat){ - //Classifies a list of images containing at most minibatchSz_ images - CV_Assert(int(inputImageList.size())<=this->minibatchSz_); - CV_Assert(outputMat.isContinuous()); -#ifdef HAVE_CAFFE - net_->input_blobs()[0]->Reshape(inputImageList.size(), 1,this->inputGeometry_.height,this->inputGeometry_.width); - net_->Reshape(); - float* inputBuffer=net_->input_blobs()[0]->mutable_cpu_data(); - float* inputData=inputBuffer; - for(size_t imgNum=0;imgNuminputGeometry_.height, this->inputGeometry_.width, CV_32FC1, inputData); - this->preprocess(inputImageList[imgNum],preprocessed); - preprocessed.copyTo(netInputWraped); - inputData+=(this->inputGeometry_.height*this->inputGeometry_.width); - } - this->net_->ForwardPrefilled(); - const float* outputNetData=net_->output_blobs()[0]->cpu_data(); - float*outputMatData=(float*)(outputMat.data); - memcpy(outputMatData,outputNetData,sizeof(float)*this->outputSize_*inputImageList.size()); -#endif - } - -#ifdef HAVE_CAFFE - Ptr > net_; -#endif - Size inputGeometry_; - int minibatchSz_;//The existence of the assignment operator mandates this to be nonconst - bool gpuBackend_;//The existence of the assignment operator mandates this to be nonconst - int outputSize_; -public: - DictNetCaffeImpl(const DictNetCaffeImpl& dn):inputGeometry_(dn.inputGeometry_),minibatchSz_(dn.minibatchSz_), - gpuBackend_(dn.gpuBackend_),outputSize_(dn.outputSize_){ - //Implemented to supress Visual Studio warning "assignment operator could not be generated" -#ifdef HAVE_CAFFE - this->net_=dn.net_; -#endif - } - DictNetCaffeImpl& operator=(const DictNetCaffeImpl &dn){ -#ifdef HAVE_CAFFE - this->net_=dn.net_; -#endif - this->inputGeometry_=dn.inputGeometry_; - this->minibatchSz_=dn.minibatchSz_; - this->gpuBackend_=dn.gpuBackend_; - this->outputSize_=dn.outputSize_; - return *this; - //Implemented to supress Visual Studio warning "assignment operator could not be generated" - } - - DictNetCaffeImpl(String modelArchFilename, String modelWeightsFilename, int maxMinibatchSz, bool useGpu) - :minibatchSz_(maxMinibatchSz), gpuBackend_(useGpu){ - CV_Assert(this->minibatchSz_>0); - CV_Assert(fileExists(modelArchFilename)); - CV_Assert(fileExists(modelWeightsFilename)); -#ifdef HAVE_CAFFE - if(this->gpuBackend_){ - caffe::Caffe::set_mode(caffe::Caffe::GPU); - }else{ - caffe::Caffe::set_mode(caffe::Caffe::CPU); - } - this->net_.reset(new caffe::Net(modelArchFilename, caffe::TEST)); - CV_Assert(net_->num_inputs()==1); - CV_Assert(net_->num_outputs()==1); - CV_Assert(this->net_->input_blobs()[0]->channels()==1); - this->net_->CopyTrainedLayersFrom(modelWeightsFilename); - caffe::Blob* inputLayer = this->net_->input_blobs()[0]; - this->inputGeometry_=Size(inputLayer->width(), inputLayer->height()); - inputLayer->Reshape(this->minibatchSz_,1,this->inputGeometry_.height, this->inputGeometry_.width); - net_->Reshape(); - this->outputSize_=net_->output_blobs()[0]->channels(); - -#else - CV_Error(Error::StsError,"Caffe not available during compilation!"); -#endif - } - - void classify(InputArray image, OutputArray classProbabilities){ - std::vector inputImageList; - inputImageList.push_back(image.getMat()); - classifyBatch(inputImageList,classProbabilities); - } - - void classifyBatch(InputArrayOfArrays inputImageList, OutputArray classProbabilities){ - std::vector allImageVector; - inputImageList.getMatVector(allImageVector); - size_t outputSize=size_t(this->outputSize_);//temporary variable to avoid int to size_t arithmentic - size_t minibatchSize=size_t(this->minibatchSz_);//temporary variable to avoid int to size_t arithmentic - classProbabilities.create(Size(int(outputSize),int(allImageVector.size())),CV_32F); - Mat outputMat = classProbabilities.getMat(); - for(size_t imgNum=0;imgNum(allImageVector.size()-imgNum,minibatchSize); - std::vector::const_iterator from=std::vector::const_iterator(allImageVector.begin()+imgNum); - std::vector::const_iterator to=std::vector::const_iterator(allImageVector.begin()+rangeEnd); - std::vector minibatchInput(from,to); - classifyMiniBatch(minibatchInput,outputMat.rowRange(int(imgNum),int(rangeEnd))); - } - } - - int getOutputSize(){ - return this->outputSize_; - } - int getMinibatchSize(){ - return this->minibatchSz_; - } - bool usingGpu(){ - return this->gpuBackend_; - } - int getBackend(){ - return OCR_HOLISTIC_BACKEND_CAFFE; - } -}; - - -Ptr DictNet::create(String archFilename,String weightsFilename,int minibatchSz,bool useGpu,int backEnd){ - switch(backEnd){ - case OCR_HOLISTIC_BACKEND_CAFFE: - return Ptr(new DictNetCaffeImpl(archFilename, weightsFilename, minibatchSz, useGpu)); - break; - case OCR_HOLISTIC_BACKEND_NONE: - default: - CV_Error(Error::StsError,"DictNet::create backend not implemented"); - return Ptr(); - break; - } -} - - -class OCRHolisticWordRecognizerImpl: public OCRHolisticWordRecognizer{ +class OCRHolisticWordRecognizerImpl : public OCRHolisticWordRecognizer +{ private: - struct NetOutput{ - //Auxiliary structure that handles the logic of getting class ids and probabillities from - //the raw outputs of caffe - int wordIdx; - float probabillity; + dnn::Net net; + vector words; - static bool sorter(const NetOutput& o1,const NetOutput& o2){//used with std::sort to provide the most probable class - return o1.probabillity>o2.probabillity; - } - - static void getOutputs(const float* buffer,int nbOutputs,std::vector& res){ - res.resize(nbOutputs); - for(int k=0;k tmp; - getOutputs(buffer,nbOutputs,tmp); - classNum=tmp[0].wordIdx; - confidence=tmp[0].probabillity; - } - }; -protected: - std::vector labels_; - Ptr classifier_; public: - OCRHolisticWordRecognizerImpl(Ptr classifierPtr,String vocabullaryFilename):classifier_(classifierPtr){ - CV_Assert(fileExists(vocabullaryFilename));//this fails for some rason - std::ifstream labelsFile(vocabullaryFilename.c_str()); - if(!labelsFile){ - CV_Error(Error::StsError,"Could not read Labels from file"); + OCRHolisticWordRecognizerImpl(const string &archFilename, const string &weightsFilename, const string &wordsFilename) + { + net = dnn::readNetFromCaffe(archFilename, weightsFilename); + std::ifstream in(wordsFilename.c_str()); + if (!in) + { + CV_Error(Error::StsError, "Could not read Labels from file"); } std::string line; - while (std::getline(labelsFile, line)){ - labels_.push_back(std::string(line)); - } - CV_Assert(this->classifier_->getOutputSize()==int(this->labels_.size())); - } - - void recogniseImage(InputArray inputImage,CV_OUT String& transcription,CV_OUT double& confidence){ - Mat netOutput; - this->classifier_->classify(inputImage,netOutput); - int classNum; - NetOutput::getClassification((float*)(netOutput.data),this->classifier_->getOutputSize(),classNum,confidence); - transcription=this->labels_[classNum]; + while (std::getline(in, line)) + words.push_back(line); + CV_Assert(getClassCount() == words.size()); } - void recogniseImageBatch(InputArrayOfArrays inputImageList,CV_OUT std::vector& transcriptionVec,CV_OUT std::vector& confidenceVec){ - Mat netOutput; - this->classifier_->classifyBatch(inputImageList,netOutput); - for(int k=0;kclassifier_->getOutputSize(),classNum,confidence); - transcriptionVec.push_back(this->labels_[classNum]); - confidenceVec.push_back(confidence); - } - } - - void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, - int component_level=0){ - CV_Assert(component_level==OCR_LEVEL_WORD);//Componnents not applicable for word spotting + void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, std::vector* component_texts=NULL, std::vector* component_confidences=NULL, int component_level=0) + { + CV_Assert(component_level==OCR_LEVEL_WORD); //Componnents not applicable for word spotting double confidence; - String transcription; - recogniseImage(image,transcription,confidence); - output_text=transcription.c_str(); + output_text = classify(image, confidence); if(component_rects!=NULL){ component_rects->resize(1); (*component_rects)[0]=Rect(0,0,image.size().width,image.size().height); } if(component_texts!=NULL){ component_texts->resize(1); - (*component_texts)[0]=transcription.c_str(); + (*component_texts)[0] = output_text; } if(component_confidences!=NULL){ component_confidences->resize(1); - (*component_confidences)[0]=float(confidence); + (*component_confidences)[0] = float(confidence); } } - void run(Mat& image, Mat& mask, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, - int component_level=0){ - CV_Assert(mask.cols==image.cols && mask.rows== image.rows);//Mask is ignored because the CNN operates on a full image - this->run(image,output_text,component_rects,component_texts,component_confidences,component_level); + + void run(Mat& image, Mat& mask, std::string& output_text, std::vector* component_rects=NULL, std::vector* component_texts=NULL, std::vector* component_confidences=NULL, int component_level=0) + { + //Mask is ignored because the CNN operates on a full image + CV_Assert(mask.cols == image.cols && mask.rows == image.rows); + this->run(image, output_text, component_rects, component_texts, component_confidences, component_level); } - std::vector& getVocabulary(){ - return this->labels_; + +protected: + Size getPerceptiveField() const + { + return Size(100, 32); + } + + size_t getClassCount() + { + int id = net.getLayerId("prob"); + dnn::MatShape inputShape; + inputShape.push_back(1); + inputShape.push_back(1); + inputShape.push_back(getPerceptiveField().height); + inputShape.push_back(getPerceptiveField().width); + vector inShapes, outShapes; + net.getLayerShapes(inputShape, id, inShapes, outShapes); + CV_Assert(outShapes.size() == 1 && outShapes[0].size() == 4); + CV_Assert(outShapes[0][0] == 1 && outShapes[0][2] == 1 && outShapes[0][3] == 1); + return outShapes[0][1]; + } + + string classify(InputArray image, double & conf) + { + CV_Assert(image.channels() == 1 && image.depth() == CV_8U); + Mat resized; + resize(image, resized, getPerceptiveField()); + Mat blob = dnn::blobFromImage(resized); + net.setInput(blob, "data"); + Mat prob = net.forward("prob"); + CV_Assert(prob.dims == 4 && !prob.empty() && prob.size[1] == (int)getClassCount()); + int idx[4] = {0}; + minMaxIdx(prob, 0, &conf, 0, idx); + CV_Assert(0 <= idx[1] && idx[1] < (int)words.size()); + return words[idx[1]]; } -}; -Ptr OCRHolisticWordRecognizer::create(Ptr classifierPtr,String vocabullaryFilename ){ - return Ptr(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabullaryFilename)); -} +}; -Ptr OCRHolisticWordRecognizer::create(String modelArchFilename, String modelWeightsFilename, String vocabullaryFilename){ - Ptr classifierPtr(new DictNetCaffeImpl(modelArchFilename,modelWeightsFilename, 100,0)); - return Ptr(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabullaryFilename)); +Ptr OCRHolisticWordRecognizer::create(const string &archFilename, const string &weightsFilename, const string &wordsFilename) +{ + return makePtr(archFilename, weightsFilename, wordsFilename); } -} } //namespace text namespace cv +}} // cv::text:: diff --git a/modules/text/text_config.hpp.in b/modules/text/text_config.hpp.in index 2e64f3bfb..ec5120a41 100644 --- a/modules/text/text_config.hpp.in +++ b/modules/text/text_config.hpp.in @@ -1,13 +1,7 @@ #ifndef __OPENCV_TEXT_CONFIG_HPP__ #define __OPENCV_TEXT_CONFIG_HPP__ -// HAVE CAFFE -#cmakedefine HAVE_CAFFE - // HAVE OCR Tesseract #cmakedefine HAVE_TESSERACT - - - #endif