parent bad02f3797
commit 3aa88889aa
9 changed files with 122 additions and 657 deletions
@@ -1,14 +0,0 @@
-# Caffe package for CNN Triplet training
-unset(Caffe_FOUND)
-
-find_path(Caffe_INCLUDE_DIR NAMES caffe/caffe.hpp caffe/common.hpp caffe/net.hpp caffe/proto/caffe.pb.h caffe/util/io.hpp caffe/vision_layers.hpp
-          HINTS
-          /usr/local/include)
-
-find_library(Caffe_LIBS NAMES caffe
-             HINTS
-             /usr/local/lib)
-
-if(Caffe_LIBS AND Caffe_INCLUDE_DIR)
-    set(Caffe_FOUND 1)
-endif()
@@ -1,10 +0,0 @@
-#Required for Caffe
-unset(Glog_FOUND)
-
-find_library(Glog_LIBS NAMES glog
-             HINTS
-             /usr/local/lib)
-
-if(Glog_LIBS)
-    set(Glog_FOUND 1)
-endif()
@@ -1,10 +0,0 @@
-#Protobuf package required for Caffe
-unset(Protobuf_FOUND)
-
-find_library(Protobuf_LIBS NAMES protobuf
-             HINTS
-             /usr/local/lib)
-
-if(Protobuf_LIBS)
-    set(Protobuf_FOUND 1)
-endif()
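These three find-modules existed only to locate a locally built Caffe and its glog/protobuf dependencies. After this commit the DictNet model is loaded through OpenCV's own dnn module, so no external Caffe build is needed; the whole dependency chain reduces to a single call. A minimal sketch (the file names are the DictNet files referenced in the sample deleted below):

    // No FindCaffe/FindGlog/FindProtobuf needed: opencv_dnn parses the
    // .prototxt and .caffemodel files directly.
    #include <opencv2/dnn.hpp>

    cv::dnn::Net net = cv::dnn::readNetFromCaffe("dictnet_vgg_deploy.prototxt",
                                                 "dictnet_vgg.caffemodel");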
@@ -1,82 +0,0 @@
-#!/usr/bin/env python
-
-import cv2
-import sys
-import os.path
-
-# Global variables shared between the mouse callback and main
-refPt = []
-cropping = False
-image = None
-drawImage = None
-dictNet = None
-wordSpotter = None
-
-
-def mouseCallback(event, x, y, flags, param):
-    # grab references to the global variables
-    global refPt, cropping, wordSpotter, drawImage, image
-
-    # if the left mouse button was clicked, record the starting
-    # (x, y) coordinates and indicate that cropping is being
-    # performed
-    if event == cv2.EVENT_LBUTTONDOWN:
-        refPt = [(x, y)]
-        cropping = True
-
-    # check to see if the left mouse button was released
-    elif event == cv2.EVENT_LBUTTONUP:
-        # record the ending (x, y) coordinates and indicate that
-        # the cropping operation is finished
-        refPt.append((x, y))
-        cropping = False
-
-        # draw a rectangle around the region of interest
-        roi = image[refPt[0][1]:refPt[1][1], refPt[0][0]:refPt[1][0]]
-        res = wordSpotter.recogniseImage(roi)
-        drawImage = image.copy()
-        cv2.rectangle(drawImage, refPt[0], refPt[1], (0, 255, 0), 2)
-        cv2.putText(drawImage, "%s:%f" % (res[0], res[1]), refPt[0], cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
-        cv2.imshow("Select A Region", drawImage)
-
-
-if __name__ == '__main__':
-    USEGPU = False
-    helpStr = """Usage: """ + sys.argv[0] + """ IMAGE_FILENAME
-
-    Press 'q' or 'Q' to exit.
-
-    The model files must be available in the current directory.
-    In a Linux shell they can be downloaded (~2GB) with the following commands:
-    wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg.caffemodel
-    wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_deploy.prototxt
-    wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_labels.txt
-    """
-    if len(sys.argv) != 2 or not os.path.isfile(sys.argv[1]):
-        print helpStr
-        print 'No image file given. Aborting!'
-        sys.exit(1)
-    if not (os.path.isfile('dictnet_vgg_deploy.prototxt') and
-            os.path.isfile('dictnet_vgg.caffemodel') and
-            os.path.isfile('dictnet_vgg_labels.txt')):
-        print helpStr
-        print 'Model files not present. Aborting!'
-        sys.exit(1)
-
-    dictNet = cv2.text.DictNet_create('./dictnet_vgg_deploy.prototxt', './dictnet_vgg.caffemodel', 100, USEGPU)
-    wordSpotter = cv2.text.OCRHolisticWordRecognizer_create(dictNet, "./dictnet_vgg_labels.txt")
-
-    image = cv2.imread(sys.argv[1])
-    drawImage = image.copy()
-    cv2.namedWindow("Select A Region")
-    cv2.setMouseCallback("Select A Region", mouseCallback)
-
-    while True:
-        cv2.imshow("Select A Region", drawImage)
-        key = cv2.waitKey(1) & 0xFF
-
-        # if the 'q' key is pressed, break from the loop
-        if key == ord("q") or key == ord("Q"):
-            break
-
-    cv2.destroyAllWindows()
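The deleted sample drove the old two-object API (`cv2.text.DictNet_create` plus `cv2.text.OCRHolisticWordRecognizer_create`). With the single-class API introduced in the next file, the equivalent create-and-recognise flow looks roughly like this (a hedged C++ sketch built from the API shown below; the interactive ROI selection and error handling are omitted):

    #include <opencv2/text.hpp>
    #include <opencv2/imgcodecs.hpp>
    #include <iostream>

    int main(int argc, char** argv)
    {
        using namespace cv;
        // One create() call replaces DictNet_create + OCRHolisticWordRecognizer_create.
        Ptr<text::OCRHolisticWordRecognizer> spotter =
            text::OCRHolisticWordRecognizer::create("dictnet_vgg_deploy.prototxt",
                                                    "dictnet_vgg.caffemodel",
                                                    "dictnet_vgg_labels.txt");
        Mat word = imread(argv[1], IMREAD_GRAYSCALE); // classify() requires a 1-channel CV_8U image
        std::string transcription;
        std::vector<float> confidences;
        spotter->run(word, transcription, NULL, NULL, &confidences, text::OCR_LEVEL_WORD);
        std::cout << transcription << ":" << confidences[0] << std::endl;
        return 0;
    }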
@@ -1,296 +1,102 @@
-#include "precomp.hpp"
-#include "opencv2/imgproc.hpp"
-#include "opencv2/core.hpp"
-
-#include <iostream>
-#include <fstream>
-#include <sstream>
-#include <queue>
-#include <algorithm>
-#include <iosfwd>
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-
-//should this be moved elsewhere?
-//In precomp.hpp it doesn't work
-#ifdef HAVE_CAFFE
-#include "caffe/caffe.hpp"
-#endif
-
-namespace cv { namespace text {
-
-//Maybe OpenCV has a routine better suited
-inline bool fileExists (String filename) {
-    std::ifstream f(filename.c_str());
-    return f.good();
-}
-
-class DictNetCaffeImpl: public DictNet{
-protected:
-    void preprocess(Mat& input, Mat& output){
-        if(input.channels()==3){
-            Mat tmpInput;
-            cvtColor(input, tmpInput, COLOR_BGR2GRAY);
-            if(input.depth()==CV_8U){
-                tmpInput.convertTo(output, CV_32FC1, 1/255.0);
-            }else{//Assuming values are in the desired [0,1] range
-                tmpInput.convertTo(output, CV_32FC1);
-            }
-        }else{
-            if(input.channels()==1){
-                if(input.depth()==CV_8U){
-                    input.convertTo(output, CV_32FC1, 1/255.0);
-                }else{//Assuming values are in the desired [0,1] range
-                    input.convertTo(output, CV_32FC1);
-                }
-            }else{
-                CV_Error(Error::StsError, "Expecting images with either 1 or 3 channels");
-            }
-        }
-        resize(output, output, this->inputGeometry_);
-        Scalar dev, mean;
-        meanStdDev(output, mean, dev);
-        subtract(output, mean[0], output);
-        divide(output, (dev[0]/128.0), output);
-    }
-
-    void classifyMiniBatch(std::vector<Mat> inputImageList, Mat outputMat){
-        //Classifies a list of images containing at most minibatchSz_ images
-        CV_Assert(int(inputImageList.size())<=this->minibatchSz_);
-        CV_Assert(outputMat.isContinuous());
-#ifdef HAVE_CAFFE
-        net_->input_blobs()[0]->Reshape(inputImageList.size(), 1, this->inputGeometry_.height, this->inputGeometry_.width);
-        net_->Reshape();
-        float* inputBuffer=net_->input_blobs()[0]->mutable_cpu_data();
-        float* inputData=inputBuffer;
-        for(size_t imgNum=0; imgNum<inputImageList.size(); imgNum++){
-            Mat preprocessed;
-            cv::Mat netInputWraped(this->inputGeometry_.height, this->inputGeometry_.width, CV_32FC1, inputData);
-            this->preprocess(inputImageList[imgNum], preprocessed);
-            preprocessed.copyTo(netInputWraped);
-            inputData+=(this->inputGeometry_.height*this->inputGeometry_.width);
-        }
-        this->net_->ForwardPrefilled();
-        const float* outputNetData=net_->output_blobs()[0]->cpu_data();
-        float* outputMatData=(float*)(outputMat.data);
-        memcpy(outputMatData, outputNetData, sizeof(float)*this->outputSize_*inputImageList.size());
-#endif
-    }
-
-#ifdef HAVE_CAFFE
-    Ptr<caffe::Net<float> > net_;
-#endif
-    Size inputGeometry_;
-    int minibatchSz_;//The existence of the assignment operator mandates this to be nonconst
-    bool gpuBackend_;//The existence of the assignment operator mandates this to be nonconst
-    int outputSize_;
-public:
-    DictNetCaffeImpl(const DictNetCaffeImpl& dn):inputGeometry_(dn.inputGeometry_), minibatchSz_(dn.minibatchSz_),
-        gpuBackend_(dn.gpuBackend_), outputSize_(dn.outputSize_){
-        //Implemented to suppress Visual Studio warning "assignment operator could not be generated"
-#ifdef HAVE_CAFFE
-        this->net_=dn.net_;
-#endif
-    }
-    DictNetCaffeImpl& operator=(const DictNetCaffeImpl &dn){
-#ifdef HAVE_CAFFE
-        this->net_=dn.net_;
-#endif
-        this->inputGeometry_=dn.inputGeometry_;
-        this->minibatchSz_=dn.minibatchSz_;
-        this->gpuBackend_=dn.gpuBackend_;
-        this->outputSize_=dn.outputSize_;
-        return *this;
-        //Implemented to suppress Visual Studio warning "assignment operator could not be generated"
-    }
-
-    DictNetCaffeImpl(String modelArchFilename, String modelWeightsFilename, int maxMinibatchSz, bool useGpu)
-        :minibatchSz_(maxMinibatchSz), gpuBackend_(useGpu){
-        CV_Assert(this->minibatchSz_>0);
-        CV_Assert(fileExists(modelArchFilename));
-        CV_Assert(fileExists(modelWeightsFilename));
-#ifdef HAVE_CAFFE
-        if(this->gpuBackend_){
-            caffe::Caffe::set_mode(caffe::Caffe::GPU);
-        }else{
-            caffe::Caffe::set_mode(caffe::Caffe::CPU);
-        }
-        this->net_.reset(new caffe::Net<float>(modelArchFilename, caffe::TEST));
-        CV_Assert(net_->num_inputs()==1);
-        CV_Assert(net_->num_outputs()==1);
-        CV_Assert(this->net_->input_blobs()[0]->channels()==1);
-        this->net_->CopyTrainedLayersFrom(modelWeightsFilename);
-        caffe::Blob<float>* inputLayer = this->net_->input_blobs()[0];
-        this->inputGeometry_=Size(inputLayer->width(), inputLayer->height());
-        inputLayer->Reshape(this->minibatchSz_, 1, this->inputGeometry_.height, this->inputGeometry_.width);
-        net_->Reshape();
-        this->outputSize_=net_->output_blobs()[0]->channels();
-#else
-        CV_Error(Error::StsError, "Caffe not available during compilation!");
-#endif
-    }
-
-    void classify(InputArray image, OutputArray classProbabilities){
-        std::vector<Mat> inputImageList;
-        inputImageList.push_back(image.getMat());
-        classifyBatch(inputImageList, classProbabilities);
-    }
-
-    void classifyBatch(InputArrayOfArrays inputImageList, OutputArray classProbabilities){
-        std::vector<Mat> allImageVector;
-        inputImageList.getMatVector(allImageVector);
-        size_t outputSize=size_t(this->outputSize_);//temporary variable to avoid int to size_t arithmetic
-        size_t minibatchSize=size_t(this->minibatchSz_);//temporary variable to avoid int to size_t arithmetic
-        classProbabilities.create(Size(int(outputSize), int(allImageVector.size())), CV_32F);
-        Mat outputMat = classProbabilities.getMat();
-        for(size_t imgNum=0; imgNum<allImageVector.size(); imgNum+=minibatchSize){
-            size_t rangeEnd=imgNum+std::min<size_t>(allImageVector.size()-imgNum, minibatchSize);
-            std::vector<Mat>::const_iterator from=std::vector<Mat>::const_iterator(allImageVector.begin()+imgNum);
-            std::vector<Mat>::const_iterator to=std::vector<Mat>::const_iterator(allImageVector.begin()+rangeEnd);
-            std::vector<Mat> minibatchInput(from, to);
-            classifyMiniBatch(minibatchInput, outputMat.rowRange(int(imgNum), int(rangeEnd)));
-        }
-    }
-
-    int getOutputSize(){
-        return this->outputSize_;
-    }
-    int getMinibatchSize(){
-        return this->minibatchSz_;
-    }
-    bool usingGpu(){
-        return this->gpuBackend_;
-    }
-    int getBackend(){
-        return OCR_HOLISTIC_BACKEND_CAFFE;
-    }
-};
-
-Ptr<DictNet> DictNet::create(String archFilename, String weightsFilename, int minibatchSz, bool useGpu, int backEnd){
-    switch(backEnd){
-    case OCR_HOLISTIC_BACKEND_CAFFE:
-        return Ptr<DictNet>(new DictNetCaffeImpl(archFilename, weightsFilename, minibatchSz, useGpu));
-        break;
-    case OCR_HOLISTIC_BACKEND_NONE:
-    default:
-        CV_Error(Error::StsError, "DictNet::create backend not implemented");
-        return Ptr<DictNet>();
-        break;
-    }
-}
-
-class OCRHolisticWordRecognizerImpl: public OCRHolisticWordRecognizer{
-private:
-    struct NetOutput{
-        //Auxiliary structure that handles the logic of getting class ids and probabilities from
-        //the raw outputs of caffe
-        int wordIdx;
-        float probabillity;
-
-        static bool sorter(const NetOutput& o1, const NetOutput& o2){//used with std::sort to provide the most probable class
-            return o1.probabillity>o2.probabillity;
-        }
-
-        static void getOutputs(const float* buffer, int nbOutputs, std::vector<NetOutput>& res){
-            res.resize(nbOutputs);
-            for(int k=0; k<nbOutputs; k++){
-                res[k].wordIdx=k;
-                res[k].probabillity=buffer[k];
-            }
-            std::sort(res.begin(), res.end(), NetOutput::sorter);
-        }
-        static void getClassification(const float* buffer, int nbOutputs, int &classNum, double& confidence){
-            std::vector<NetOutput> tmp;
-            getOutputs(buffer, nbOutputs, tmp);
-            classNum=tmp[0].wordIdx;
-            confidence=tmp[0].probabillity;
-        }
-    };
-protected:
-    std::vector<String> labels_;
-    Ptr<TextImageClassifier> classifier_;
-public:
-    OCRHolisticWordRecognizerImpl(Ptr<TextImageClassifier> classifierPtr, String vocabullaryFilename):classifier_(classifierPtr){
-        CV_Assert(fileExists(vocabullaryFilename));//this fails for some reason
-        std::ifstream labelsFile(vocabullaryFilename.c_str());
-        if(!labelsFile){
-            CV_Error(Error::StsError, "Could not read Labels from file");
-        }
-        std::string line;
-        while (std::getline(labelsFile, line)){
-            labels_.push_back(std::string(line));
-        }
-        CV_Assert(this->classifier_->getOutputSize()==int(this->labels_.size()));
-    }
-
-    void recogniseImage(InputArray inputImage, CV_OUT String& transcription, CV_OUT double& confidence){
-        Mat netOutput;
-        this->classifier_->classify(inputImage, netOutput);
-        int classNum;
-        NetOutput::getClassification((float*)(netOutput.data), this->classifier_->getOutputSize(), classNum, confidence);
-        transcription=this->labels_[classNum];
-    }
-    void recogniseImageBatch(InputArrayOfArrays inputImageList, CV_OUT std::vector<String>& transcriptionVec, CV_OUT std::vector<double>& confidenceVec){
-        Mat netOutput;
-        this->classifier_->classifyBatch(inputImageList, netOutput);
-        for(int k=0; k<netOutput.rows; k++){
-            int classNum;
-            double confidence;
-            NetOutput::getClassification((float*)(netOutput.row(k).data), this->classifier_->getOutputSize(), classNum, confidence);
-            transcriptionVec.push_back(this->labels_[classNum]);
-            confidenceVec.push_back(confidence);
-        }
-    }
-
-    void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL,
-             std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
-             int component_level=0){
-        CV_Assert(component_level==OCR_LEVEL_WORD);//Components not applicable for word spotting
-        double confidence;
-        String transcription;
-        recogniseImage(image, transcription, confidence);
-        output_text=transcription.c_str();
-        if(component_rects!=NULL){
-            component_rects->resize(1);
-            (*component_rects)[0]=Rect(0, 0, image.size().width, image.size().height);
-        }
-        if(component_texts!=NULL){
-            component_texts->resize(1);
-            (*component_texts)[0]=transcription.c_str();
-        }
-        if(component_confidences!=NULL){
-            component_confidences->resize(1);
-            (*component_confidences)[0]=float(confidence);
-        }
-    }
-    void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
-             std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
-             int component_level=0){
-        CV_Assert(mask.cols==image.cols && mask.rows==image.rows);//Mask is ignored because the CNN operates on a full image
-        this->run(image, output_text, component_rects, component_texts, component_confidences, component_level);
-    }
-    std::vector<String>& getVocabulary(){
-        return this->labels_;
-    }
-};
-
-Ptr<OCRHolisticWordRecognizer> OCRHolisticWordRecognizer::create(Ptr<TextImageClassifier> classifierPtr, String vocabullaryFilename){
-    return Ptr<OCRHolisticWordRecognizer>(new OCRHolisticWordRecognizerImpl(classifierPtr, vocabullaryFilename));
-}
-
-Ptr<OCRHolisticWordRecognizer> OCRHolisticWordRecognizer::create(String modelArchFilename, String modelWeightsFilename, String vocabullaryFilename){
-    Ptr<TextImageClassifier> classifierPtr(new DictNetCaffeImpl(modelArchFilename, modelWeightsFilename, 100, 0));
-    return Ptr<OCRHolisticWordRecognizer>(new OCRHolisticWordRecognizerImpl(classifierPtr, vocabullaryFilename));
-}
-
-} } //namespace text namespace cv
+#include "precomp.hpp"
+#include "opencv2/imgproc.hpp"
+#include "opencv2/core.hpp"
+#include "opencv2/dnn.hpp"
+
+#include <fstream>
+
+using namespace std;
+
+namespace cv { namespace text {
+
+class OCRHolisticWordRecognizerImpl : public OCRHolisticWordRecognizer
+{
+private:
+    dnn::Net net;
+    vector<string> words;
+
+public:
+    OCRHolisticWordRecognizerImpl(const string &archFilename, const string &weightsFilename, const string &wordsFilename)
+    {
+        net = dnn::readNetFromCaffe(archFilename, weightsFilename);
+        std::ifstream in(wordsFilename.c_str());
+        if (!in)
+        {
+            CV_Error(Error::StsError, "Could not read Labels from file");
+        }
+        std::string line;
+        while (std::getline(in, line))
+            words.push_back(line);
+        CV_Assert(getClassCount() == words.size());
+    }
+
+    void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL, std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL, int component_level=0)
+    {
+        CV_Assert(component_level==OCR_LEVEL_WORD); //Components not applicable for word spotting
+        double confidence;
+        output_text = classify(image, confidence);
+        if(component_rects!=NULL){
+            component_rects->resize(1);
+            (*component_rects)[0]=Rect(0,0,image.size().width,image.size().height);
+        }
+        if(component_texts!=NULL){
+            component_texts->resize(1);
+            (*component_texts)[0] = output_text;
+        }
+        if(component_confidences!=NULL){
+            component_confidences->resize(1);
+            (*component_confidences)[0] = float(confidence);
+        }
+    }
+
+    void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL, std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL, int component_level=0)
+    {
+        //Mask is ignored because the CNN operates on a full image
+        CV_Assert(mask.cols == image.cols && mask.rows == image.rows);
+        this->run(image, output_text, component_rects, component_texts, component_confidences, component_level);
+    }
+
+protected:
+    Size getPerceptiveField() const
+    {
+        return Size(100, 32);
+    }
+
+    size_t getClassCount()
+    {
+        int id = net.getLayerId("prob");
+        dnn::MatShape inputShape;
+        inputShape.push_back(1);
+        inputShape.push_back(1);
+        inputShape.push_back(getPerceptiveField().height);
+        inputShape.push_back(getPerceptiveField().width);
+        vector<dnn::MatShape> inShapes, outShapes;
+        net.getLayerShapes(inputShape, id, inShapes, outShapes);
+        CV_Assert(outShapes.size() == 1 && outShapes[0].size() == 4);
+        CV_Assert(outShapes[0][0] == 1 && outShapes[0][2] == 1 && outShapes[0][3] == 1);
+        return outShapes[0][1];
+    }
+
+    string classify(InputArray image, double & conf)
+    {
+        CV_Assert(image.channels() == 1 && image.depth() == CV_8U);
+        Mat resized;
+        resize(image, resized, getPerceptiveField());
+        Mat blob = dnn::blobFromImage(resized);
+        net.setInput(blob, "data");
+        Mat prob = net.forward("prob");
+        CV_Assert(prob.dims == 4 && !prob.empty() && prob.size[1] == (int)getClassCount());
+        int idx[4] = {0};
+        minMaxIdx(prob, 0, &conf, 0, idx);
+        CV_Assert(0 <= idx[1] && idx[1] < (int)words.size());
+        return words[idx[1]];
+    }
+};
+
+Ptr<OCRHolisticWordRecognizer> OCRHolisticWordRecognizer::create(const string &archFilename, const string &weightsFilename, const string &wordsFilename)
+{
+    return makePtr<OCRHolisticWordRecognizerImpl>(archFilename, weightsFilename, wordsFilename);
+}

+}} // cv::text::
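Stripped of the class scaffolding, the new implementation is a plain dnn forward pass. The sketch below restates the classify() step using only calls that appear in the code above; the "data" and "prob" layer names come from the DictNet deploy prototxt:

    #include <opencv2/dnn.hpp>
    #include <opencv2/imgproc.hpp>
    #include <string>
    #include <vector>

    // Resize to the 100x32 perceptive field, wrap as a 1x1x32x100 blob,
    // run the net, and take the arg-max over the class axis of "prob".
    static std::string classifyWord(cv::dnn::Net& net, const cv::Mat& gray8u,
                                    const std::vector<std::string>& words, double& conf)
    {
        cv::Mat resized;
        cv::resize(gray8u, resized, cv::Size(100, 32));  // getPerceptiveField()
        net.setInput(cv::dnn::blobFromImage(resized), "data");
        cv::Mat prob = net.forward("prob");              // 1xNx1x1 class probabilities
        int idx[4] = {0};
        cv::minMaxIdx(prob, 0, &conf, 0, idx);           // max location across all 4 dims
        return words[idx[1]];                            // idx[1] indexes the class axis
    }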
@@ -1,13 +1,7 @@
 #ifndef __OPENCV_TEXT_CONFIG_HPP__
 #define __OPENCV_TEXT_CONFIG_HPP__
 
-// HAVE CAFFE
-#cmakedefine HAVE_CAFFE
-
 // HAVE OCR Tesseract
 #cmakedefine HAVE_TESSERACT
 
-
-
-
 #endif
|
Loading…
Reference in new issue